Source code for mdpow.workflows.base

# MDPOW: base.py
# 2022 Cade Duckworth

"""
:mod:`mdpow.workflows.base` --- Automated workflow base functions
=================================================================

To analyze multiple MDPOW projects, provide :func:`project_paths`
with the top-level directory containing all MDPOW projects' simulation data
to obtain a :class:`pandas.DataFrame` containing the project information
and paths. Then, :func:`automated_project_analysis` takes as input the
aforementioned :class:`pandas.DataFrame` and runs the specified
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for all MDPOW projects
under the top-level directory provided to :func:`project_paths`.

.. seealso:: :mod:`~mdpow.workflows.registry`

.. autofunction:: project_paths
.. autofunction:: automated_project_analysis
.. autofunction:: guess_elements

"""

import os
import re
import logging

import numpy as np
import pandas as pd
from MDAnalysis.topology import guessers, tables

logger = logging.getLogger("mdpow.workflows.base")


[docs]def project_paths(parent_directory=None, csv=None, csv_save_dir=None): """Takes a top directory containing MDPOW projects and determines the molname, resname, and path, of each MDPOW project within. Optionally takes a .csv file containing `molname`, `resname`, and `paths`, in that order. :keywords: *parent_directory* the path for the location of the top directory under which the subdirectories of MDPOW simulation data exist, additionally creates a 'project_paths.csv' file for user manipulation of metadata and for future reference *csv* .csv file containing the molecule names, resnames, and paths, in that order, for the MDPOW simulation data to be iterated over must contain header of the form: `molecule,resname,path` *csv_save_dir* optionally provided directory to save .csv file, otherwise, data will be saved in current working directory :returns: *project_paths* :class:`pandas.DataFrame` containing MDPOW project metadata .. rubric:: Example Typical Workflow:: project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects') automated_project_analysis(project_paths) or:: project_paths = project_paths(csv='/foo/bar/MDPOW.csv') automated_project_analysis(project_paths) """ if parent_directory is not None: locations = [] reg_compile = re.compile("FEP") for dirpath, dirnames, filenames in os.walk(parent_directory): result = [ dirpath.strip() for dirname in dirnames if reg_compile.match(dirname) ] if result: locations.append(result[0]) resnames = [] for loc in locations: res_temp = loc.strip().split("/") resnames.append(res_temp[-1]) project_paths = ( pd.DataFrame({"molecule": resnames, "resname": resnames, "path": locations}) .sort_values(by=["molecule", "resname", "path"]) .reset_index(drop=True) ) if csv_save_dir is not None: project_paths.to_csv(f"{csv_save_dir}/project_paths.csv", index=False) logger.info(f"project_paths saved under {csv_save_dir}") else: current_directory = os.getcwd() project_paths.to_csv("project_paths.csv", index=False) logger.info(f"project_paths saved under {current_directory}") elif csv is not None: locations = pd.read_csv(csv) project_paths = locations.sort_values( by=["molecule", "resname", "path"] ).reset_index(drop=True) return project_paths
[docs]def automated_project_analysis(project_paths, ensemble_analysis, **kwargs): """Takes a :class:`pandas.DataFrame` created by :func:`~mdpow.workflows.base.project_paths` and iteratively runs the specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for each of the projects by running the associated automated workflow in each project directory returned by :func:`~mdpow.workflows.base.project_paths`. Compatibility with more automated analyses in development. :keywords: *project_paths* :class:`pandas.DataFrame` that provides paths to MDPOW projects *ensemble_analysis* name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis` that corresponds to the desired automated workflow module *kwargs* keyword arguments for the supported automated workflows, see the :mod:`~mdpow.workflows.registry` for all available workflows and their call signatures .. rubric:: Example A typical workflow is the automated dihedral analysis from :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis* :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project. The :data:`~mdpow.workflows.registry.registry` contains this automated workflow under the key *"DihedralAnalysis"* and so the automated execution for all `project_paths` (obtained via :func:`project_paths`) is performed by passing the specific key to :func:`automated_project_analysis`:: project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects') automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs) """ # import inside function to avoid circular imports from .registry import registry for row in project_paths.itertuples(): molname = row.molecule resname = row.resname dirname = row.path logger.info(f"starting {molname}") try: registry[ensemble_analysis]( dirname=dirname, resname=resname, molname=molname, **kwargs ) logger.info(f"{molname} completed") except KeyError as err: msg = ( f"Invalid ensemble_analysis {err}. An EnsembleAnalysis type that corresponds " "to an existing automated workflow module must be input as a kwarg. " "ex: ensemble_analysis='DihedralAnalysis'" ) logger.error(f"{err} is an invalid selection") raise KeyError(msg) except TypeError as err: msg = ( f"Invalid ensemble_analysis {ensemble_analysis}. An EnsembleAnalysis type that " "corresponds to an existing automated workflow module must be input as a kwarg. " "ex: ensemble_analysis='DihedralAnalysis'" ) logger.error(f"workflow module for {ensemble_analysis} does not exist yet") raise TypeError(msg) logger.info("all analyses completed") return
[docs]def guess_elements(atoms, rtol=1e-3): """guess elements for atoms from masses Given masses, we perform a reverse lookup on :data:`MDAnalysis.topology.tables.masses` to find the corresponding element. Only atoms where the standard MDAnalysis guesser finds elements with masses contradicting the topology masses are corrected. .. Note:: This function *requires* correct masses to be present. No sanity checks because MDPOW always uses TPR files that contain correct masses. :arguments: *atoms* MDAnalysis AtomGroup *with masses defined* :keywords: *rtol* relative tolerance for a match (as used in :func:`numpy.isclose`); atol=1e-6 is at a fixed value, which means that "zero" is only recognized for values =< 1e-6 .. note:: In order to reliably match GROMACS masses, *rtol* should be at least 1e-3. :returns: *elements* array of guessed element symbols, in same order as `atoms` .. rubric:: Example As an example we guess masses and then set the elements for all atoms:: elements = guess_elements(atoms) atoms.add_TopologyAttr("elements", elements) """ ATOL = 1e-6 names = atoms.names masses = atoms.masses mda_elements = np.fromiter(tables.masses.keys(), dtype="U5") mda_masses = np.fromiter(tables.masses.values(), dtype=np.float64) guessed_elements = guessers.guess_types(names) guessed_masses = np.array([guessers.get_atom_mass(n) for n in guessed_elements]) problems = np.logical_not(np.isclose(masses, guessed_masses, atol=ATOL, rtol=rtol)) # match only problematic masses against the MDA reference masses iproblem, ielem = np.nonzero( np.isclose(masses[problems, np.newaxis], mda_masses, atol=ATOL, rtol=rtol) ) # We should normally find a match for each problem but just in case, assert and # give some useful information for debugging. assert len(ielem) == sum(problems), ( "Not all masses could be assigned an element, " f"missing names {set(names[problems]) - set(names[problems][iproblem])}" ) guessed_elements[problems] = mda_elements[ielem] # manually fix some dummies that are labelled "D": set ALL zero masses to DUMMY guessed_elements[np.isclose(masses, 0, atol=ATOL)] = "DUMMY" return guessed_elements