Source code for aiida_cp2k.utils.datatype_helpers

###############################################################################
# Copyright (c), The AiiDA-CP2K authors.                                      #
# SPDX-License-Identifier: MIT                                                #
# AiiDA-CP2K is hosted on GitHub at https://github.com/aiidateam/aiida-cp2k   #
# For further information on the license, see the LICENSE.txt file.           #
###############################################################################
"""AiiDA-CP2K Gaussian Datatype Helpers."""

import re
from collections.abc import Sequence

from aiida.common import InputValidationError
from aiida.plugins import DataFactory


[docs]def _unpack(adict):
    """Unpack any lists as values into single elements for the key"""

    for key, value in adict.items():
        if isinstance(value, Sequence):
            for item in value:
                yield (key, item)
        else:
            yield (key, value)


[docs]def _parse_name(label, default_type, sep=None):
    """
    Both BASIS_SET and POTENTIAL values can consist of either a single word or multiple ones,
    of which the first will be the type (if present).  Here we parse it and always return a tuple.
    """

    try:
        ltype, label = label.split(sep=sep, maxsplit=1)
    except ValueError:
        ltype = default_type

    return ltype, label


ELEMENT_MATCH = re.compile(r"(?P<sym>[a-z]{1,3})\d*", re.IGNORECASE)


[docs]def _kind_element_from_kind_section(section):
    """
    Get both kind and chemical symbol from a section, implementing
    the same auto-detection for chemical symbol/element from a KIND parameter
    as CP2K does.
    """
    try:
        kind = section["_"]
    except KeyError:
        raise InputValidationError("No default parameter '_' found in KIND section.")

    try:
        element = section["ELEMENT"]
    except KeyError:
        # if there is no ELEMENT, CP2K automatically guesses it from the KIND, do the same
        match = ELEMENT_MATCH.match(kind)
        try:
            element = match["sym"]
        except TypeError:
            raise InputValidationError(
                f"Unable to figure out atomic symbol from KIND '{kind}'."
            )

    return kind, element


[docs]def _prepare_kind_section(inp, kind):
    """
    Insert a KIND section for a given 'StructureData.Kind'.
    Returns a reference to the newly created KIND section.
    """

    if "SUBSYS" not in inp["FORCE_EVAL"]:
        inp["FORCE_EVAL"]["SUBSYS"] = {}

    if "KIND" not in inp["FORCE_EVAL"]["SUBSYS"]:
        inp["FORCE_EVAL"]["SUBSYS"]["KIND"] = []

    inp["FORCE_EVAL"]["SUBSYS"]["KIND"].append(
        {
            "_": kind.name,
            "ELEMENT": kind.symbol,
        }
    )

    return inp["FORCE_EVAL"]["SUBSYS"]["KIND"][-1]


[docs]def _validate_gdt_namespace(entries, gdt_cls, attr):
    """Common namespace validator for both basissets and pseudos"""

    identifiers = []

    for kind, gdt_instance in _unpack(entries):
        if not isinstance(gdt_instance, gdt_cls):
            return f"invalid {attr} for '{kind}' specified"

        identifier = (gdt_instance.element, gdt_instance.name)

        if identifier in identifiers:
            # note: this should be possible for basissets with different versions
            #       but at this point we should require some format for the key to match it
            return f"{attr} for kind {gdt_instance.element} ({gdt_instance.name}) specified multiple times"

        identifiers += [identifier]

    return None


[docs]def _write_gdt(inp, entries, folder, key, fname):
    """inject <key>=<fname> into all FORCE_EVAL/DFT sections and write the entries to a file"""

    for secpath, section in inp.param_iter(sections=True):
        if secpath[-1].upper() == "DFT":
            section[key] = fname

    with open(folder.get_abs_path(fname), mode="w", encoding="utf-8") as fhandle:
        for _, entry in _unpack(entries):
            entry.to_cp2k(fhandle)


[docs]def validate_basissets_namespace(basissets, _):
    """A input_namespace validator to ensure passed down basis sets have the correct type."""
    return _validate_gdt_namespace(
        basissets, DataFactory("gaussian.basisset"), "basis set"
    )


[docs]def validate_basissets(inp, basissets, structure):
    """
    Verify that all referenced basissets are present in the input.
    Currently supports 2 modes: either all of the basisssets are explicitly
    listed in a KIND section, or none of them are, at which point they're
    verified against the symbols in the structure.
    """
    # pylint: disable=too-many-branches,too-many-statements

    # convert a structure
    #   {
    #      "ORB_O": [BasisSet<1>, BasisSet<2>],
    #      "AUX_O": BasisSet<3>,
    #      "H": BasisSet<4>,
    #   }
    # into
    #  [ ("ORB", "O", BasisSet<1>),
    #    ("ORB", "O", BasisSet<2>),
    #    ("AUX", "O", BasisSet<3>),
    #    ("ORB", "H", BasisSet<4>) ]
    # e.g. resolving any label to a (type,label) tuple, and unpack any list of basissets
    basissets = [
        (*_parse_name(label, default_type="ORB", sep="_"), bset)
        for label, bset in _unpack(basissets)
    ]
    basissets_specified = {bset for _, _, bset in basissets}
    basissets_used = set()
    explicit_kinds = []  # list of kinds with explicitly specified kind sections

    for section in (
        section
        for secpath, section in inp.param_iter(sections=True)
        if secpath[-1].upper() == "KIND"
    ):
        kind, element = _kind_element_from_kind_section(section)
        explicit_kinds += [kind]

        try:
            bsnames = section["BASIS_SET"]
        except KeyError:
            # if the BASIS_SET keyword is not present, try to look one up based on the given basissets
            bsets = [(t, b) for t, s, b in basissets if s == kind]

            # try again with lov.. with a chemical symbol
            if not bsets:
                bsets = [(t, b) for t, s, b in basissets if s == element]

            if not bsets:
                raise InputValidationError(
                    f"No basis set found for kind {kind} or element {element}"
                    f" in basissets input namespace and not explicitly set."
                )

            if len(bsets) > 1:
                section["BASIS_SET"] = [f"bstype {bset.name}" for bstype, bset in bsets]
            else:
                section["BASIS_SET"] = f"{bsets[0][0]} {bsets[0][1].name}"

            basissets_used.update(bset for _, bset in bsets)
        else:
            # The keyword BASIS_SET can occur multiple times, even for the same type, in which case
            # the specified basis sets are merged (given they match the same type)
            if isinstance(bsnames, str):
                bsnames = [_parse_name(bsnames, "ORB")]
            else:
                bsnames = [_parse_name(bsname, "ORB") for bsname in bsnames]

            for bstype, bsname in bsnames:
                bsets = [(t, b) for t, s, b in basissets if s == kind]

                # try again with a chemical symbol
                if not bsets:
                    bsets = [(t, b) for t, s, b in basissets if s == element]

                if not bsets:
                    raise InputValidationError(
                        f"'BASIS_SET {bstype} {bsname}' for element {element} (from kind {kind})"
                        " not found in basissets input namespace"
                    )

                for _, bset in bsets:
                    if bsname in bset.aliases:
                        basissets_used.add(bset)
                        break
                else:
                    raise InputValidationError(
                        f"'BASIS_SET {bstype} {bsname}' for element {element} (from kind {kind})"
                        " not found in basissets input namespace"
                    )

    # if there is no structure and there are any unreferenced basissets, end it here
    if not structure and any(
        bset not in basissets_used for bset in basissets_specified
    ):
        raise InputValidationError(
            "No explicit structure given and basis sets not referenced in input"
        )

    if isinstance(inp["FORCE_EVAL"], Sequence) and any(
        kind.name not in explicit_kinds for kind in structure.kinds
    ):
        raise InputValidationError(
            "Automated BASIS_SET keyword creation is not yet supported with multiple FORCE_EVALs."
            " Please explicitly reference a BASIS_SET for each KIND."
        )

    # check the structure against the present KIND sections and generate the missing ones
    for kind in structure.kinds:
        if kind.name in explicit_kinds:
            # nothing to do if the user already specified a KIND section for this KIND
            continue

        # the user can specify multiple types and even multiple basissets for the same KIND or ELEMENT
        # Try to find all of them by matching KIND name

        bsets = [(t, b) for t, s, b in basissets if s == kind.name]

        # if that returned none, try matching by chemical symbol/element again:
        if not bsets:
            bsets = [(t, b) for t, s, b in basissets if s == kind.symbol]

        if not bsets:
            raise InputValidationError(
                f"No basis set found in the given basissets for kind '{kind.name}' of your structure."
            )

        for _, bset in bsets:
            if bset.element != kind.symbol:
                raise InputValidationError(
                    f"Basis set '{bset.name}' for '{bset.element}' specified"
                    f" for kind '{kind.name}' (of '{kind.symbol}')."
                )

        kind_section = _prepare_kind_section(inp, kind)
        if len(bsets) > 1:
            kind_section["BASIS_SET"] = [
                f"{bstype} {bset.name}" for bstype, bset in bsets
            ]
        else:
            kind_section["BASIS_SET"] = f"{bsets[0][0]} {bsets[0][1].name}"

        explicit_kinds += [kind.name]
        basissets_used.update(bset for _, bset in bsets)

    for bset in basissets_specified:
        if bset not in basissets_used:
            raise InputValidationError(
                f"Basis set '{bset.name}' ('{bset.element}') specified in the basissets"
                f" input namespace but not referenced by either input or structure."
            )


[docs]def write_basissets(inp, basissets, folder):
    """Writes the unified BASIS_SETS file with the used basissets"""
    _write_gdt(inp, basissets, folder, "BASIS_SET_FILE_NAME", "BASIS_SETS")


[docs]def validate_pseudos_namespace(pseudos, _):
    """A input_namespace validator to ensure passed down pseudopentials have the correct type."""
    return _validate_gdt_namespace(pseudos, DataFactory("gaussian.pseudo"), "pseudo")


[docs]def validate_pseudos(inp, pseudos, structure):
    """Verify that all referenced pseudos are present in the input"""

    # pylint: disable=too-many-branches,too-many-statements

    pseudos_specified = {pseudo for _, pseudo in _unpack(pseudos)}
    pseudos_used = set()
    explicit_kinds = []  # list of kinds with explicitly specified kind sections

    for section in (
        section
        for secpath, section in inp.param_iter(sections=True)
        if secpath[-1].upper() == "KIND"
    ):
        kind, element = _kind_element_from_kind_section(section)
        explicit_kinds += [kind]

        try:
            pname = section["POTENTIAL"]
        except KeyError:
            # if the POTENTIAL keyword is not present, try to look one up based on given pseudos
            try:
                # first try with the KIND since this is the most specific one
                # NOTE: compared to basissets it doesn't make sense for the user to specify the type
                #       since the type of a pseudo can not be chosen (it is either an GTH, ECP, STO, etc.)
                pseudo = pseudos[kind]
            except KeyError:
                try:
                    pseudo = pseudos[element]
                except KeyError:
                    raise InputValidationError(
                        f"No pseudopotential found for kind {kind} or element {element}"
                        f" in pseudos input namespace and not explicitly set."
                    )

            # if the POTENTIAL keyword is missing completely, fill it up:
            section["POTENTIAL"] = f"GTH {pseudo.name}"
        else:
            ptype, pname = _parse_name(pname, "GTH")

            try:
                # first try with the KIND since this is the most specific one
                pseudo = pseudos[kind]
            except KeyError:
                try:
                    pseudo = pseudos[element]
                except KeyError:
                    raise InputValidationError(
                        f"'POTENTIAL {ptype} {pname}' for element {element} (from kind {kind})"
                        " not found in pseudos input namespace"
                    )

            if pname not in pseudo.aliases:
                raise InputValidationError(
                    f"'POTENTIAL {ptype} {pname}' for element {element} (from kind {kind})"
                    " not found in pseudos input namespace"
                )

        if pseudo.element != element:
            raise InputValidationError(
                f"Pseudopotential '{pseudo.name}' for '{pseudo.element}' specified"
                f" for element '{element}'."
            )

        pseudos_used.add(pseudo)

    # if there is no structure and there are any unreferenced pseudos, end it here
    if not structure and any(
        pseudo not in pseudos_used for pseudo in pseudos_specified
    ):
        raise InputValidationError(
            "No explicit structure given and pseudo not referenced in input"
        )

    if isinstance(inp["FORCE_EVAL"], Sequence) and any(
        kind.name not in explicit_kinds for kind in structure.kinds
    ):
        raise InputValidationError(
            "Automated POTENTIAL keyword creation is not yet supported with multiple FORCE_EVALs."
            " Please explicitly reference a POTENTIAL for each KIND."
        )

    # check the structure against the present KIND sections and generate the missing ones
    for kind in structure.kinds:
        if kind.name in explicit_kinds:
            # nothing to do if the user already specified a KIND section for this KIND
            continue

        try:
            pseudo = pseudos[kind.name]
        except KeyError:
            # if that returned none, try matching by chemical symbol/element again:
            try:
                pseudo = pseudos[kind.symbol]
            except KeyError:
                raise InputValidationError(
                    f"No basis set found in the given basissets"
                    f" for kind '{kind.name}' (or '{kind.symbol}') of your structure."
                )

        if pseudo.element != kind.symbol:
            raise InputValidationError(
                f"Pseudopotential '{pseudo.name}' for '{pseudo.element}' specified"
                f" for kind '{kind.name}' (of '{kind.symbol}')."
            )

        kind_section = _prepare_kind_section(inp, kind)
        kind_section["POTENTIAL"] = f"GTH {pseudo.name}"

        explicit_kinds += [kind.name]
        pseudos_used.add(pseudo)

    for pseudo in pseudos_specified:
        if pseudo not in pseudos_used:
            raise InputValidationError(
                f"Pseudopodential '{pseudo.name}' specified in the pseudos input namespace"
                f" but not referenced by either input or structure."
            )


[docs]def write_pseudos(inp, pseudos, folder):
    """Writes the unified POTENTIAL file with the used pseudos"""
    _write_gdt(inp, pseudos, folder, "POTENTIAL_FILE_NAME", "POTENTIAL")