Source code for Scripts.DatabankLib.form_factor

"""
Module form_factor serves for bilayer form factor calculations.
"""

import sys
import os
import re
import MDAnalysis as mda
import numpy as np
import json
import time
from pprint import pprint
from tqdm import tqdm

# for testing purposes to safe time for loading trajectory and creating dictonary
# the electron.dat will be decripted in the future as the values will be used from
# the universal mapping file
import periodictable

from DatabankLib.core import System
from DatabankLib.databankLibrary import lipids_set, getLipids
from DatabankLib.jsonEncoders import CompactJSONEncoder
from DatabankLib import NMLDB_SIMU_PATH, NMLDB_DATA_PATH


# To write data in numpy arrays into json file, we inherit compact JSON
# encoder to make him store 2xN numpy arrays as just a nested list

[docs]
class NumpyArrayEncoder(CompactJSONEncoder):

[docs]
    def encode(self, o):
        if isinstance(o, np.ndarray):
            return CompactJSONEncoder.encode(self, o.tolist())
        else:
            return CompactJSONEncoder.encode(self, o)





[docs]
class FormFactor:
    """
    Calculates form factors from density profiles.

    Further development will include thickness of the membrane
    from the intersection of lipid density and water density.
    Already enables to calculate electrom/mass/number densities

    Density could be calculated only from the final form factor profile
    - testing is needed to see stability for rare species.

    Examination of FF error spikes is needed!
    """

    # Group of static private helper methods

    @staticmethod
    def __list_name_pairs(mapping_dict, molecule):
        pairs_dictionary = {}
        residues = [mv['RESIDUE']
                    for _, mv in mapping_dict.items()
                    if 'RESIDUE' in mv]

        if residues:
            pairs_dictionary = dict.fromkeys(residues)
            print(pairs_dictionary.keys())
            for res in pairs_dictionary.keys():
                pairs_dictionary[res] = [
                    [mk, mv['ATOMNAME']]
                    for mk, mv in mapping_dict.items()
                    if res == mv['RESIDUE']
                    ]
        else:
            pairs_dictionary = {
                molecule: [[mn, mv['ATOMNAME']] for mn, mv in mapping_dict.items()]
            }

        return pairs_dictionary

    @staticmethod
    def __filter_hbonds(mapping_names):
        re_h = re.compile(r'M_([A-Z]{1,2}[0-9]{1,4})*H[0-4]{1,2}_M')
        filtered = filter(re_h.match, mapping_names)
        return filtered

    @staticmethod
    def __get_water(rmf):
        return ('resname ' + rmf['COMPOSITION']['SOL']['NAME'])

    # -- regular methods --

    def __init__(self, conf, traj, nbin, output,
                 system: System, density_type="electron"):
        self.conf = conf
        self.traj = traj
        self.system = system
        self.__path = os.path.join(NMLDB_SIMU_PATH, system['path'])
        start_time = time.time()
        try:
            self.u = mda.Universe(self.conf, self.traj)
            print()
        except Exception as e:
            if (self.conf[-3:] == 'gro'):
                raise e
            # assume conf was TPR
            gro = self.__path + os.sep + 'conf.gro'
            print("Generating conf.gro because MDAnalysis cannot read tpr version")
            os.system(f"echo System | gmx trjconv "
                      f"-s {self.conf} -f {self.traj} -dump 0 -o {gro}")
            self.conf = gro
            self.u = mda.Universe(self.conf, self.traj)

        print("Loading the trajectory takes {:10.6f} s".format(time.time()-start_time))

        # number of bins
        self.nbin = nbin
        # the totatl box size in [nm] - will be probably removed and tpr box size or
        # transform of final FF will be used instead

        self.output = os.path.join(self.__path, output)
        self.lipids = getLipids(self.system)
        self.waters = FormFactor.__get_water(self.system)

        self.density_type = density_type

        self.calculate_weight()
        self.calculate_density()


[docs]
    @staticmethod
    def get_electrons(mapping_name):
        # removes numbers and '_M' from the end of string
        name1 = re.sub(r"[0-9]*_M$", "", mapping_name)
        # removes M_X12Y23... sequence from the beginning
        name2 = re.sub(r"^M_([A-Z]{1,2}[0-9]{1,4})*", "", name1)

        # name2 is the atom and electrons are assigned to this atom

        if name2 == "G":  # G is a glycerol carbon so change G to C
            name2 = "C"

        try:
            el = getattr(periodictable, name2).number
        except AttributeError:
            print(
                f"ERROR: This mapping name cannot be read by our rules: {mapping_name}",
                file=sys.stderr,
            )
            print("Consider changing naming in your mapping file.")
            sys.exit(1)

        # TODO Modify default charge modification
        if name2 in ['K', 'Na', 'Cs']:
            el -= 1
        elif name2 in ['Cl']:
            el += 1
        #

        # TODO REMOVE WEIRD HERITAGE!!!
        if name2 == "D":
            el = 0
        # WEIRD HERITAGE!!!

        return el



[docs]
    def residue_electrons_all(self, molecule):
        """
        Return list of electrons
        """
        print(f"Gathering electron list for {molecule}")
        electrons = []
        # get the name of molecule used in simulation files
        molname = self.system['COMPOSITION'][molecule]['NAME']

        pairs_residue = FormFactor.__list_name_pairs(
            self.system.content[molecule].mapping_dict, molname)

        # if lipid is split to multiple residues
        selection_txt = ""

        try:
            selection_txt = (
                "moltype " +
                self.system['COMPOSITION'][molecule]['MOLTYPE'] +
                " and ")
        except KeyError:
            pass

        for res in pairs_residue.keys():  # fix second res in explicit_atoms
            # TODO: REPLACE THIS ENTIRE CODE WITH MDANALYSIS
            # when lipids are split into more than one residue in topology with
            # same residue names e.g. PGR,PA,OL and PE, PA,OL then
            # `u.select_atoms("resname " + res)` does not distinguish
            # which PA or PL belongs to POPG or POPE
            selection_txt = selection_txt + "resname " + res
            # explicit atoms of the residue
            explicit_atoms = list(self.u.select_atoms(selection_txt).atoms.names)
            for atom in explicit_atoms:
                # find generic mapping name matching to forcefield atom name
                atom_i1 = None
                for i in range(len(pairs_residue[res])):
                    _cm = pairs_residue[res][i]  # [UNIQUE_NAME, NAME] pair
                    if re.match(_cm[1].replace('+', '\\+'), atom):
                        atom_i1 = i
                        break
                if atom_i1 is None:
                    raise ValueError(f"Atom was not found: {res}:{atom}")
                # # find mapping name
                # index_list = [atom in pairs for pairs in pairs_residue[res]]
                # print (atom, pairs_residue[res])
                # atom_i1 = index_list.index(True)

                mapping_name = pairs_residue[res][atom_i1][0]
                # get number of electrons in an atom i of residue
                e_atom_i = FormFactor.get_electrons(mapping_name)
                electrons.append(e_atom_i)

        return electrons



[docs]
    def assign_electrons(self):
        # check if simulation is united atom or all atom
        try:
            ua_flag = self.system['UNITEDATOM_DICT']
        except KeyError:
            ua_flag = False
        else:
            ua_flag = True

        weights = []
        l_weights = []  # separate list for lipid electrons
        w_weights = []  # separate list for water electrons

        if ua_flag:  # UNITED ATOM SIMULATIONS
            for molecule1 in self.system['COMPOSITION'].keys():
                print(molecule1)

                if molecule1 in lipids_set:
                    molecule2 = self.system['COMPOSITION'][molecule1]['NAME']
                    electrons = []

                    pairs_residue = FormFactor.__list_name_pairs(
                        self.system.content[molecule2].mapping_dict, molecule2)
                    h_atoms = FormFactor.__filter_hbonds(
                        self.system.content[molecule2].mapping_dict.keys())
                    # list of hydrogen atoms

                    # extract explicit atoms and get the mapping names
                    for res in pairs_residue.keys():
                        # explicit atoms of the residue
                        explicit_atoms = list(self.u.select_atoms(
                            "resname " + res).atoms.names)
                        for atom in explicit_atoms:
                            # print(atom)
                            # find generic mapping name matching to forcefield atom name
                            index_list = [atom in pairs for pairs in pairs_residue[res]]
                            atom_i1 = index_list.index(True)
                            mapping_name = pairs_residue[res][atom_i1][0]
                            # remove _M from the end of mapping name
                            name1 = re.sub(r'_M', '', mapping_name[::1])

                            # count how many times name1 occurs in atomsH i.e. how
                            # many H are bound to it
                            h_num = sum(
                                1 for h_atm in h_atoms
                                if name1 == re.sub(r'H[1-4]_M', '', h_atm[::1])
                                )
                            number_e = FormFactor.get_electrons(mapping_name)
                            e_atom_i = number_e + h_num
                            electrons.append(e_atom_i)

                    weights.extend(electrons)
                    l_weights.extend(electrons)
                else:
                    w = self.residue_electrons_all(molecule1)
                    weights.extend(w)
                    if molecule1 == 'SOL':
                        w_weights.extend(w)

        else:  # ALL ATOM SIMULATIONS
            for molecule in self.system['COMPOSITION'].keys():
                w = self.residue_electrons_all(molecule)
                weights.extend(w)
                if molecule in lipids_set:
                    l_weights.extend(w)
                elif molecule == 'SOL':
                    w_weights.extend(w)
        # how to do lipids split into three residues??    MAYBE WORKS

        return weights, l_weights, w_weights



[docs]
    def calculate_weight(self):
        """
        Creates an array of weights for all atoms in the simulation.

        For electron densities:
         - creates dictonary of atom types and number of electrons
           loaded form an external file --> in the future it backmaps
                                            the atom names to mapping files
                                            and automaticaly assigns # of electrons

        Number densities:
         - all weights are 1

        Mass densities:
         - reads masses from u.atoms.masses
        """
        start_time = time.time()
        if self.density_type == "electron":
            # calc weights to calculate the electron density
            # with the function that maps the electrons to atoms
            self.wght, self.lipid_wght, self.water_wght = self.assign_electrons()
            print("lenght of wght: " + str(len(self.wght)))
            print("lenght of lipid_wght: " + str(len(self.lipid_wght)))
            print("atoms in system: " + str(len(self.u.atoms.names)))

            if len(self.wght) != len(self.u.atoms.names):
                print("ERROR: Number of atoms mismatch")
                print("ERROR: If lipids are split to many residues add moltype "
                      "of lipids from tpr as 'MOLTYPE' to README.yaml "
                      "under 'COMPOSITION'.")

        if self.density_type == "number":
            self.wght = np.ones(self.u.atoms.names.shape[0])
            clipids = self.u.select_atoms(self.lipids)
            self.lipid_wght = np.ones(clipids.atoms.names.shape[0])
            cwaters = self.u.select_atoms(self.waters)
            self.lipid_wght = np.ones(cwaters.atoms.names.shape[0])

        if self.density_type == "mass":
            self.wght = self.u.atoms.masses
            clipids = self.u.select_atoms(self.lipids)
            self.lipid_wght = np.ones(clipids.atoms.names.shape[0])
            cwaters = self.u.select_atoms(self.waters)
            self.lipid_wght = np.ones(cwaters.atoms.names.shape[0])

        print("Creating the electron mapping dictonary takes {:10.6f} s"
              .format(time.time()-start_time))



[docs]
    def calculate_density(self):
        c = self.u.select_atoms(self.lipids)
        print(c)  # TODO: remove excessive debug printing!

        box_z = self.u.dimensions[2]  # + 10 if fails
        print(box_z)
        d = box_z/10 / self.nbin  # bin width
        box_h = box_z/10
        print(box_h)
        x = np.linspace(-box_h/2, box_h/2, self.nbin+1)[:-1] + d/2
        density_z_centered = np.zeros(self.nbin)
        density_z_no_center = np.zeros(self.nbin)
        density_lipids_center = np.zeros(self.nbin)
        density_waters_center = np.zeros(self.nbin)

        # Calculte density profiles and FF from individual frames
        start_time = time.time()
        min_z = 10000000
        frame = 0

        elnum_dict = {}

        try:
            ua_flag = self.system['UNITEDATOM_DICT']
        except KeyError:
            ua_flag = False
        else:
            if self.system['UNITEDATOM_DICT'] is None:
                ua_flag = False
            else:
                ua_flag = True

        for key1 in self.system['COMPOSITION'].keys():
            pdbname = self.system['COMPOSITION'][key1]['NAME']
            print(pdbname)
            mdict = self.system.content[key1].mapping_dict
            for univ_aname in mdict:
                aname = mdict[univ_aname]['ATOMNAME']
                try:
                    resname = mdict[univ_aname]['RESIDUE']
                except (KeyError, TypeError):
                    resname = pdbname

                if resname not in elnum_dict.keys():
                    elnum_dict[resname] = {}

                if ua_flag and key1 in lipids_set:
                    ua_lip_json_name = os.path.join(
                        NMLDB_DATA_PATH, 'lipid_json_buildH',
                        self.system['UNITEDATOM_DICT'][key1] + '.json')

                    with open(ua_lip_json_name) as json_file:
                        ua_lip_json = json.load(json_file)

                    h_num = 0
                    try:
                        if ua_lip_json[aname][0] == "CH":
                            h_num = 1
                        if ua_lip_json[aname][0] == "CH2":
                            h_num = 2
                        if ua_lip_json[aname][0] == "CH3":
                            h_num = 3
                    except (KeyError, TypeError):
                        pass

                    elnum_dict[resname][aname] = \
                        FormFactor.get_electrons(univ_aname) + h_num

                else:
                    elnum_dict[resname][aname] = FormFactor.get_electrons(univ_aname)

        print("ElectronNumbers dictionary: ")
        pprint(elnum_dict)

        # define selections and dictionaries for profile calculations
        clipids = self.u.select_atoms(self.lipids)
        cwaters = self.u.select_atoms(self.waters)

        print("Generating final electron arrays from frame #1..", end='')
        weights_all = np.zeros(self.u.atoms.n_atoms)

        for rname, aname2enum in elnum_dict.items():
            for aname, enum in aname2enum.items():
                # all
                csel = self.u.select_atoms(f"resname {rname} and name {aname}")
                weights_all[[_a.index for _a in csel]] = enum

        weights_lipids = weights_all[[_a.index for _a in clipids]]
        weights_waters = weights_all[[_a.index for _a in cwaters]]
        print("done.")

        for ts in tqdm(self.u.trajectory, desc="Iterating over trajectory"):
            # count the index of the frame, numbered from 0, used to be used for the
            # density profile averaging
            # -- posible not needed now
            frame += 1  # ts.frame

            # reads the dimension in z-direction
            box_z = ts.dimensions[2]
            if box_z/10 < min_z:
                min_z = box_z/10

            # reads the coordinates of all of the atoms
            crds = self.u.atoms.positions

            crds_no_center = c.atoms.positions[:, 2]/10

            # calculates the center of mass of the selected atoms that the density
            # should be centered around and takes the z-coordinate value
            ctom = c.atoms.center_of_mass()[2]

            # moves the center of mass of the selected centering group into box/2
            crds[:, 2] += box_z/2 - ctom

            # shifts the coordinates in the universe by the value of
            # the center of mass
            self.u.atoms.positions = crds

            # puts the atoms back to the original box dimension; it possibly does not
            # take PBC into account, therefore it may brake some of the water molecules;
            # -- try it, come to the issue later
            self.u.atoms.pack_into_box()

            # shif the coordinates so that the center in z-dimention is in 0;
            # divide by 10 to get the coordinates in nm, since now the crds are
            # only the z coordinates
            crds = (self.u.atoms.positions[:, 2] - box_z/2)/10
            clipids_center = (clipids.atoms.positions[:, 2] - box_z/2)/10
            cwaters_center = (cwaters.atoms.positions[:, 2] - box_z/2)/10

            # calculates the volume of the bin; d- the "height" of a bin;
            # assumes in [nm]
            # ts.dimension[0], ts.dimension[1] - the x and y dimension;
            # in [A] --> devides by 100
            vbin = d*np.prod(ts.dimensions[:2])/100  # needed for density, correct!

            # calculates the total density profile; keep for now
            density_z_centered += np.histogram(
                crds, bins=self.nbin, range=(-box_h/2, box_h/2),
                weights=weights_all / vbin,
            )[0]
            density_z_no_center += np.histogram(
                crds_no_center, bins=self.nbin, range=(0, box_h),
                weights=weights_lipids / vbin,
            )[0]
            # ELECTRON WEIGHTS
            density_lipids_center += np.histogram(
                clipids_center, bins=self.nbin, range=(-box_h/2, box_h/2),
                weights=weights_lipids / vbin,
            )[0]
            # ELECTRON WEIGHTS
            density_waters_center += np.histogram(
                cwaters_center, bins=self.nbin, range=(-box_h/2, box_h/2),
                weights=weights_waters / vbin,
            )[
                0
            ]

        print("Calculating the density takes {:10.6f} s".format(time.time()-start_time))

        """ Normalizing the profiles """
        density_z_centered /= (frame)
        density_z_no_center /= frame
        density_lipids_center /= frame
        density_waters_center /= frame

        # Post-processign data and writing to file
        density_data = np.vstack((x, density_z_centered)).transpose()
        density_data_no_center = np.vstack((x, density_z_no_center)).transpose()
        density_lipids_center = np.vstack((x, density_lipids_center)).transpose()
        density_waters_center = np.vstack((x, density_waters_center)).transpose()

        # Get the indexes of the final density data where all the time steps contribute
        # In other words, take the coordinates of the smalest box from the simulation
        final_ff_start = int(np.round(self.nbin/2 - min_z/d/2)) + 1
        final_ff_end = int(np.round(self.nbin/2 + min_z/d/2)) - 1

        ff_range = np.linspace(0, 999, 1000)
        fa_aver, fb_aver = self.fourier(
            density_data[final_ff_start:final_ff_end, 1],
            density_data[final_ff_end, 0] - density_data[final_ff_start, 0],
            ff_range,
            density_data[1, 0] - density_data[0, 0],
        )

        # Plot density profiles from the average density with minimal box
        fourrier_result2 = np.sqrt(np.multiply(fa_aver, fa_aver) +
                                   np.multiply(fb_aver, fb_aver))
        fourrier_data2 = np.vstack((ff_range*0.1*0.01, fourrier_result2)).transpose()

        # Save data into files
        # minimum box size density
        with open(str(self.output)+"TotalDensity.dat", 'wb') as f:
            np.savetxt(f, density_data[final_ff_start+1:final_ff_end-1, :],
                       fmt='%8.4f  %.8f')

        with open(str(self.output)+"LipidDensity_no_center.dat", 'wb') as f:
            np.savetxt(f, density_data_no_center[final_ff_start+1:final_ff_end-1, :],
                       fmt='%8.4f  %.8f')

        with open(str(self.output)+"LipidDensity.dat", 'wb') as f:
            np.savetxt(f, density_lipids_center[final_ff_start+1:final_ff_end-1, :],
                       fmt='%8.4f  %.8f')

        with open(str(self.output)+"WaterDensity.dat", 'wb') as f:
            np.savetxt(f, density_waters_center[final_ff_start+1:final_ff_end-1, :],
                       fmt='%8.4f  %.8f')

        # this is the important file form factors
        with open(str(self.output)+"FormFactor.dat", 'wb') as f:
            np.savetxt(f, fourrier_data2, fmt='%8.4f  %.8f')

        # write outputs in JSON

        with open(str(self.output)+"TotalDensity.json", 'w') as f:
            json.dump(density_data[final_ff_start+1:final_ff_end-1, :],
                      f, cls=NumpyArrayEncoder)

        with open(str(self.output)+"LipidDensity.json", 'w') as f:
            json.dump(density_lipids_center[final_ff_start+1:final_ff_end-1, :],
                      f, cls=NumpyArrayEncoder)

        with open(str(self.output)+"WaterDensity.json", 'w') as f:
            json.dump(density_waters_center[final_ff_start+1:final_ff_end-1, :],
                      f, cls=NumpyArrayEncoder)

        with open(str(self.output)+"FormFactor.json", 'w') as f:
            json.dump(fourrier_data2, f, cls=NumpyArrayEncoder)

    # end calculate_density


[docs]
    def fourier(self, ff_density, box_z, ff_range, d_ff):
        """Calculates fourier transform of ff_density in the FF_range.
        It calculates a "height" of a bin for FF puroposes; in this case the number of
        bins is constant and the bin width changes.

        Args:
            ff_density (_type_): TODO: fill
            box_z (_type_): TODO: fill
            ff_range (_type_): TODO: fill
            d_ff (_type_): TODO: fill

        Returns:
            _type_: TODO: fill
        """

        # Creates the direct space coordinates
        # the calculations are stable with rounding (and others) errors in the direct
        # space coordinates
        ff_x = np.linspace(-box_z/2, box_z/2, ff_density.shape[0]+1)[:-1] +\
            box_z/2/ff_density.shape[0]

        k = 0
        bulk = 0
        while k*d_ff < 0.33:
            bulk += ff_density[k] + ff_density[-k-1]
            k += 1
        bulk /= (2*k)

        fa = np.zeros(ff_range.shape[0])
        fb = np.zeros(ff_range.shape[0])

        for j in range(0, ff_density.shape[0]):
            fa += (ff_density[j]-bulk)*np.cos(ff_range*ff_x[j]*0.01)*d_ff
            fb += (ff_density[j]-bulk)*np.sin(ff_range*ff_x[j]*0.01)*d_ff

        return fa, fb