Source code for Scripts.DatabankLib.form_factor
"""
Module form_factor serves for bilayer form factor calculations.
"""
import sys
import os
import re
import MDAnalysis as mda
import numpy as np
import json
import time
from pprint import pprint
from tqdm import tqdm
# for testing purposes to safe time for loading trajectory and creating dictonary
# the electron.dat will be decripted in the future as the values will be used from
# the universal mapping file
import periodictable
from DatabankLib.core import System
from DatabankLib.databankLibrary import lipids_set, getLipids
from DatabankLib.jsonEncoders import CompactJSONEncoder
from DatabankLib import NMLDB_SIMU_PATH, NMLDB_DATA_PATH
# To write data in numpy arrays into json file, we inherit compact JSON
# encoder to make him store 2xN numpy arrays as just a nested list
[docs]
class NumpyArrayEncoder(CompactJSONEncoder):
[docs]
def encode(self, o):
if isinstance(o, np.ndarray):
return CompactJSONEncoder.encode(self, o.tolist())
else:
return CompactJSONEncoder.encode(self, o)
[docs]
class FormFactor:
"""
Calculates form factors from density profiles.
Further development will include thickness of the membrane
from the intersection of lipid density and water density.
Already enables to calculate electrom/mass/number densities
Density could be calculated only from the final form factor profile
- testing is needed to see stability for rare species.
Examination of FF error spikes is needed!
"""
# Group of static private helper methods
@staticmethod
def __list_name_pairs(mapping_dict, molecule):
pairs_dictionary = {}
residues = [mv['RESIDUE']
for _, mv in mapping_dict.items()
if 'RESIDUE' in mv]
if residues:
pairs_dictionary = dict.fromkeys(residues)
print(pairs_dictionary.keys())
for res in pairs_dictionary.keys():
pairs_dictionary[res] = [
[mk, mv['ATOMNAME']]
for mk, mv in mapping_dict.items()
if res == mv['RESIDUE']
]
else:
pairs_dictionary = {
molecule: [[mn, mv['ATOMNAME']] for mn, mv in mapping_dict.items()]
}
return pairs_dictionary
@staticmethod
def __filter_hbonds(mapping_names):
re_h = re.compile(r'M_([A-Z]{1,2}[0-9]{1,4})*H[0-4]{1,2}_M')
filtered = filter(re_h.match, mapping_names)
return filtered
@staticmethod
def __get_water(rmf):
return ('resname ' + rmf['COMPOSITION']['SOL']['NAME'])
# -- regular methods --
def __init__(self, conf, traj, nbin, output,
system: System, density_type="electron"):
self.conf = conf
self.traj = traj
self.system = system
self.__path = os.path.join(NMLDB_SIMU_PATH, system['path'])
start_time = time.time()
try:
self.u = mda.Universe(self.conf, self.traj)
print()
except Exception as e:
if (self.conf[-3:] == 'gro'):
raise e
# assume conf was TPR
gro = self.__path + os.sep + 'conf.gro'
print("Generating conf.gro because MDAnalysis cannot read tpr version")
os.system(f"echo System | gmx trjconv "
f"-s {self.conf} -f {self.traj} -dump 0 -o {gro}")
self.conf = gro
self.u = mda.Universe(self.conf, self.traj)
print("Loading the trajectory takes {:10.6f} s".format(time.time()-start_time))
# number of bins
self.nbin = nbin
# the totatl box size in [nm] - will be probably removed and tpr box size or
# transform of final FF will be used instead
self.output = os.path.join(self.__path, output)
self.lipids = getLipids(self.system)
self.waters = FormFactor.__get_water(self.system)
self.density_type = density_type
self.calculate_weight()
self.calculate_density()
[docs]
@staticmethod
def get_electrons(mapping_name):
# removes numbers and '_M' from the end of string
name1 = re.sub(r"[0-9]*_M$", "", mapping_name)
# removes M_X12Y23... sequence from the beginning
name2 = re.sub(r"^M_([A-Z]{1,2}[0-9]{1,4})*", "", name1)
# name2 is the atom and electrons are assigned to this atom
if name2 == "G": # G is a glycerol carbon so change G to C
name2 = "C"
try:
el = getattr(periodictable, name2).number
except AttributeError:
print(
f"ERROR: This mapping name cannot be read by our rules: {mapping_name}",
file=sys.stderr,
)
print("Consider changing naming in your mapping file.")
sys.exit(1)
# TODO Modify default charge modification
if name2 in ['K', 'Na', 'Cs']:
el -= 1
elif name2 in ['Cl']:
el += 1
#
# TODO REMOVE WEIRD HERITAGE!!!
if name2 == "D":
el = 0
# WEIRD HERITAGE!!!
return el
[docs]
def residue_electrons_all(self, molecule):
"""
Return list of electrons
"""
print(f"Gathering electron list for {molecule}")
electrons = []
# get the name of molecule used in simulation files
molname = self.system['COMPOSITION'][molecule]['NAME']
pairs_residue = FormFactor.__list_name_pairs(
self.system.content[molecule].mapping_dict, molname)
# if lipid is split to multiple residues
selection_txt = ""
try:
selection_txt = (
"moltype " +
self.system['COMPOSITION'][molecule]['MOLTYPE'] +
" and ")
except KeyError:
pass
for res in pairs_residue.keys(): # fix second res in explicit_atoms
# TODO: REPLACE THIS ENTIRE CODE WITH MDANALYSIS
# when lipids are split into more than one residue in topology with
# same residue names e.g. PGR,PA,OL and PE, PA,OL then
# `u.select_atoms("resname " + res)` does not distinguish
# which PA or PL belongs to POPG or POPE
selection_txt = selection_txt + "resname " + res
# explicit atoms of the residue
explicit_atoms = list(self.u.select_atoms(selection_txt).atoms.names)
for atom in explicit_atoms:
# find generic mapping name matching to forcefield atom name
atom_i1 = None
for i in range(len(pairs_residue[res])):
_cm = pairs_residue[res][i] # [UNIQUE_NAME, NAME] pair
if re.match(_cm[1].replace('+', '\\+'), atom):
atom_i1 = i
break
if atom_i1 is None:
raise ValueError(f"Atom was not found: {res}:{atom}")
# # find mapping name
# index_list = [atom in pairs for pairs in pairs_residue[res]]
# print (atom, pairs_residue[res])
# atom_i1 = index_list.index(True)
mapping_name = pairs_residue[res][atom_i1][0]
# get number of electrons in an atom i of residue
e_atom_i = FormFactor.get_electrons(mapping_name)
electrons.append(e_atom_i)
return electrons
[docs]
def assign_electrons(self):
# check if simulation is united atom or all atom
try:
ua_flag = self.system['UNITEDATOM_DICT']
except KeyError:
ua_flag = False
else:
ua_flag = True
weights = []
l_weights = [] # separate list for lipid electrons
w_weights = [] # separate list for water electrons
if ua_flag: # UNITED ATOM SIMULATIONS
for molecule1 in self.system['COMPOSITION'].keys():
print(molecule1)
if molecule1 in lipids_set:
molecule2 = self.system['COMPOSITION'][molecule1]['NAME']
electrons = []
pairs_residue = FormFactor.__list_name_pairs(
self.system.content[molecule2].mapping_dict, molecule2)
h_atoms = FormFactor.__filter_hbonds(
self.system.content[molecule2].mapping_dict.keys())
# list of hydrogen atoms
# extract explicit atoms and get the mapping names
for res in pairs_residue.keys():
# explicit atoms of the residue
explicit_atoms = list(self.u.select_atoms(
"resname " + res).atoms.names)
for atom in explicit_atoms:
# print(atom)
# find generic mapping name matching to forcefield atom name
index_list = [atom in pairs for pairs in pairs_residue[res]]
atom_i1 = index_list.index(True)
mapping_name = pairs_residue[res][atom_i1][0]
# remove _M from the end of mapping name
name1 = re.sub(r'_M', '', mapping_name[::1])
# count how many times name1 occurs in atomsH i.e. how
# many H are bound to it
h_num = sum(
1 for h_atm in h_atoms
if name1 == re.sub(r'H[1-4]_M', '', h_atm[::1])
)
number_e = FormFactor.get_electrons(mapping_name)
e_atom_i = number_e + h_num
electrons.append(e_atom_i)
weights.extend(electrons)
l_weights.extend(electrons)
else:
w = self.residue_electrons_all(molecule1)
weights.extend(w)
if molecule1 == 'SOL':
w_weights.extend(w)
else: # ALL ATOM SIMULATIONS
for molecule in self.system['COMPOSITION'].keys():
w = self.residue_electrons_all(molecule)
weights.extend(w)
if molecule in lipids_set:
l_weights.extend(w)
elif molecule == 'SOL':
w_weights.extend(w)
# how to do lipids split into three residues?? MAYBE WORKS
return weights, l_weights, w_weights
[docs]
def calculate_weight(self):
"""
Creates an array of weights for all atoms in the simulation.
For electron densities:
- creates dictonary of atom types and number of electrons
loaded form an external file --> in the future it backmaps
the atom names to mapping files
and automaticaly assigns # of electrons
Number densities:
- all weights are 1
Mass densities:
- reads masses from u.atoms.masses
"""
start_time = time.time()
if self.density_type == "electron":
# calc weights to calculate the electron density
# with the function that maps the electrons to atoms
self.wght, self.lipid_wght, self.water_wght = self.assign_electrons()
print("lenght of wght: " + str(len(self.wght)))
print("lenght of lipid_wght: " + str(len(self.lipid_wght)))
print("atoms in system: " + str(len(self.u.atoms.names)))
if len(self.wght) != len(self.u.atoms.names):
print("ERROR: Number of atoms mismatch")
print("ERROR: If lipids are split to many residues add moltype "
"of lipids from tpr as 'MOLTYPE' to README.yaml "
"under 'COMPOSITION'.")
if self.density_type == "number":
self.wght = np.ones(self.u.atoms.names.shape[0])
clipids = self.u.select_atoms(self.lipids)
self.lipid_wght = np.ones(clipids.atoms.names.shape[0])
cwaters = self.u.select_atoms(self.waters)
self.lipid_wght = np.ones(cwaters.atoms.names.shape[0])
if self.density_type == "mass":
self.wght = self.u.atoms.masses
clipids = self.u.select_atoms(self.lipids)
self.lipid_wght = np.ones(clipids.atoms.names.shape[0])
cwaters = self.u.select_atoms(self.waters)
self.lipid_wght = np.ones(cwaters.atoms.names.shape[0])
print("Creating the electron mapping dictonary takes {:10.6f} s"
.format(time.time()-start_time))
[docs]
def calculate_density(self):
c = self.u.select_atoms(self.lipids)
print(c) # TODO: remove excessive debug printing!
box_z = self.u.dimensions[2] # + 10 if fails
print(box_z)
d = box_z/10 / self.nbin # bin width
box_h = box_z/10
print(box_h)
x = np.linspace(-box_h/2, box_h/2, self.nbin+1)[:-1] + d/2
density_z_centered = np.zeros(self.nbin)
density_z_no_center = np.zeros(self.nbin)
density_lipids_center = np.zeros(self.nbin)
density_waters_center = np.zeros(self.nbin)
# Calculte density profiles and FF from individual frames
start_time = time.time()
min_z = 10000000
frame = 0
elnum_dict = {}
try:
ua_flag = self.system['UNITEDATOM_DICT']
except KeyError:
ua_flag = False
else:
if self.system['UNITEDATOM_DICT'] is None:
ua_flag = False
else:
ua_flag = True
for key1 in self.system['COMPOSITION'].keys():
pdbname = self.system['COMPOSITION'][key1]['NAME']
print(pdbname)
mdict = self.system.content[key1].mapping_dict
for univ_aname in mdict:
aname = mdict[univ_aname]['ATOMNAME']
try:
resname = mdict[univ_aname]['RESIDUE']
except (KeyError, TypeError):
resname = pdbname
if resname not in elnum_dict.keys():
elnum_dict[resname] = {}
if ua_flag and key1 in lipids_set:
ua_lip_json_name = os.path.join(
NMLDB_DATA_PATH, 'lipid_json_buildH',
self.system['UNITEDATOM_DICT'][key1] + '.json')
with open(ua_lip_json_name) as json_file:
ua_lip_json = json.load(json_file)
h_num = 0
try:
if ua_lip_json[aname][0] == "CH":
h_num = 1
if ua_lip_json[aname][0] == "CH2":
h_num = 2
if ua_lip_json[aname][0] == "CH3":
h_num = 3
except (KeyError, TypeError):
pass
elnum_dict[resname][aname] = \
FormFactor.get_electrons(univ_aname) + h_num
else:
elnum_dict[resname][aname] = FormFactor.get_electrons(univ_aname)
print("ElectronNumbers dictionary: ")
pprint(elnum_dict)
# define selections and dictionaries for profile calculations
clipids = self.u.select_atoms(self.lipids)
cwaters = self.u.select_atoms(self.waters)
print("Generating final electron arrays from frame #1..", end='')
weights_all = np.zeros(self.u.atoms.n_atoms)
for rname, aname2enum in elnum_dict.items():
for aname, enum in aname2enum.items():
# all
csel = self.u.select_atoms(f"resname {rname} and name {aname}")
weights_all[[_a.index for _a in csel]] = enum
weights_lipids = weights_all[[_a.index for _a in clipids]]
weights_waters = weights_all[[_a.index for _a in cwaters]]
print("done.")
for ts in tqdm(self.u.trajectory, desc="Iterating over trajectory"):
# count the index of the frame, numbered from 0, used to be used for the
# density profile averaging
# -- posible not needed now
frame += 1 # ts.frame
# reads the dimension in z-direction
box_z = ts.dimensions[2]
if box_z/10 < min_z:
min_z = box_z/10
# reads the coordinates of all of the atoms
crds = self.u.atoms.positions
crds_no_center = c.atoms.positions[:, 2]/10
# calculates the center of mass of the selected atoms that the density
# should be centered around and takes the z-coordinate value
ctom = c.atoms.center_of_mass()[2]
# moves the center of mass of the selected centering group into box/2
crds[:, 2] += box_z/2 - ctom
# shifts the coordinates in the universe by the value of
# the center of mass
self.u.atoms.positions = crds
# puts the atoms back to the original box dimension; it possibly does not
# take PBC into account, therefore it may brake some of the water molecules;
# -- try it, come to the issue later
self.u.atoms.pack_into_box()
# shif the coordinates so that the center in z-dimention is in 0;
# divide by 10 to get the coordinates in nm, since now the crds are
# only the z coordinates
crds = (self.u.atoms.positions[:, 2] - box_z/2)/10
clipids_center = (clipids.atoms.positions[:, 2] - box_z/2)/10
cwaters_center = (cwaters.atoms.positions[:, 2] - box_z/2)/10
# calculates the volume of the bin; d- the "height" of a bin;
# assumes in [nm]
# ts.dimension[0], ts.dimension[1] - the x and y dimension;
# in [A] --> devides by 100
vbin = d*np.prod(ts.dimensions[:2])/100 # needed for density, correct!
# calculates the total density profile; keep for now
density_z_centered += np.histogram(
crds, bins=self.nbin, range=(-box_h/2, box_h/2),
weights=weights_all / vbin,
)[0]
density_z_no_center += np.histogram(
crds_no_center, bins=self.nbin, range=(0, box_h),
weights=weights_lipids / vbin,
)[0]
# ELECTRON WEIGHTS
density_lipids_center += np.histogram(
clipids_center, bins=self.nbin, range=(-box_h/2, box_h/2),
weights=weights_lipids / vbin,
)[0]
# ELECTRON WEIGHTS
density_waters_center += np.histogram(
cwaters_center, bins=self.nbin, range=(-box_h/2, box_h/2),
weights=weights_waters / vbin,
)[
0
]
print("Calculating the density takes {:10.6f} s".format(time.time()-start_time))
""" Normalizing the profiles """
density_z_centered /= (frame)
density_z_no_center /= frame
density_lipids_center /= frame
density_waters_center /= frame
# Post-processign data and writing to file
density_data = np.vstack((x, density_z_centered)).transpose()
density_data_no_center = np.vstack((x, density_z_no_center)).transpose()
density_lipids_center = np.vstack((x, density_lipids_center)).transpose()
density_waters_center = np.vstack((x, density_waters_center)).transpose()
# Get the indexes of the final density data where all the time steps contribute
# In other words, take the coordinates of the smalest box from the simulation
final_ff_start = int(np.round(self.nbin/2 - min_z/d/2)) + 1
final_ff_end = int(np.round(self.nbin/2 + min_z/d/2)) - 1
ff_range = np.linspace(0, 999, 1000)
fa_aver, fb_aver = self.fourier(
density_data[final_ff_start:final_ff_end, 1],
density_data[final_ff_end, 0] - density_data[final_ff_start, 0],
ff_range,
density_data[1, 0] - density_data[0, 0],
)
# Plot density profiles from the average density with minimal box
fourrier_result2 = np.sqrt(np.multiply(fa_aver, fa_aver) +
np.multiply(fb_aver, fb_aver))
fourrier_data2 = np.vstack((ff_range*0.1*0.01, fourrier_result2)).transpose()
# Save data into files
# minimum box size density
with open(str(self.output)+"TotalDensity.dat", 'wb') as f:
np.savetxt(f, density_data[final_ff_start+1:final_ff_end-1, :],
fmt='%8.4f %.8f')
with open(str(self.output)+"LipidDensity_no_center.dat", 'wb') as f:
np.savetxt(f, density_data_no_center[final_ff_start+1:final_ff_end-1, :],
fmt='%8.4f %.8f')
with open(str(self.output)+"LipidDensity.dat", 'wb') as f:
np.savetxt(f, density_lipids_center[final_ff_start+1:final_ff_end-1, :],
fmt='%8.4f %.8f')
with open(str(self.output)+"WaterDensity.dat", 'wb') as f:
np.savetxt(f, density_waters_center[final_ff_start+1:final_ff_end-1, :],
fmt='%8.4f %.8f')
# this is the important file form factors
with open(str(self.output)+"FormFactor.dat", 'wb') as f:
np.savetxt(f, fourrier_data2, fmt='%8.4f %.8f')
# write outputs in JSON
with open(str(self.output)+"TotalDensity.json", 'w') as f:
json.dump(density_data[final_ff_start+1:final_ff_end-1, :],
f, cls=NumpyArrayEncoder)
with open(str(self.output)+"LipidDensity.json", 'w') as f:
json.dump(density_lipids_center[final_ff_start+1:final_ff_end-1, :],
f, cls=NumpyArrayEncoder)
with open(str(self.output)+"WaterDensity.json", 'w') as f:
json.dump(density_waters_center[final_ff_start+1:final_ff_end-1, :],
f, cls=NumpyArrayEncoder)
with open(str(self.output)+"FormFactor.json", 'w') as f:
json.dump(fourrier_data2, f, cls=NumpyArrayEncoder)
# end calculate_density
[docs]
def fourier(self, ff_density, box_z, ff_range, d_ff):
"""Calculates fourier transform of ff_density in the FF_range.
It calculates a "height" of a bin for FF puroposes; in this case the number of
bins is constant and the bin width changes.
Args:
ff_density (_type_): TODO: fill
box_z (_type_): TODO: fill
ff_range (_type_): TODO: fill
d_ff (_type_): TODO: fill
Returns:
_type_: TODO: fill
"""
# Creates the direct space coordinates
# the calculations are stable with rounding (and others) errors in the direct
# space coordinates
ff_x = np.linspace(-box_z/2, box_z/2, ff_density.shape[0]+1)[:-1] +\
box_z/2/ff_density.shape[0]
k = 0
bulk = 0
while k*d_ff < 0.33:
bulk += ff_density[k] + ff_density[-k-1]
k += 1
bulk /= (2*k)
fa = np.zeros(ff_range.shape[0])
fb = np.zeros(ff_range.shape[0])
for j in range(0, ff_density.shape[0]):
fa += (ff_density[j]-bulk)*np.cos(ff_range*ff_x[j]*0.01)*d_ff
fb += (ff_density[j]-bulk)*np.sin(ff_range*ff_x[j]*0.01)*d_ff
return fa, fb