Source code for mdpow.fep

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# mdpow: fep.py
# Copyright (c) 2010 Oliver Beckstein

r"""
:mod:`mdpow.fep` -- Calculate free energy of solvation
======================================================

Set up and run free energy perturbation (FEP) calculations to
calculate the free energy of hydration of a solute in a solvent
box. The protocol follows the works of D. Mobley (`Free Energy
Tutorial`_) and M. Shirts, and uses Gromacs 4.0.x.

Required Input:

 - topology
 - equilibrated structure of the solvated molecule

See the docs for :class:`Gsolv` for details on what is calculated.


.. _Free Energy Tutorial:
   http://www.dillgroup.ucsf.edu/group/wiki/index.php?title=Free_Energy:_Tutorial

Differences to published protocols
----------------------------------

Some notes on how the approach encoded here differs from what others
(notabley Mobley) did:

- We use Gromacs 4.x and use the new decoupling feature ::

    couple-intramol = no

  which indicates that "intra-molecular interactions remain". It should (as I
  understand it) allow us to only calculate the free energy changes in solution
  so that we do not have to do an extra calculation in vacuo.
  http://www.mail-archive.com/gmx-users@gromacs.org/msg18803.html

  Mobley does an extra discharging calculation in vacuo and calculates

  .. math::

      \Delta A = \Delta A_{\mathrm{coul}}(\mathrm{vac}) - (\Delta A_{\mathrm{coul}}(\mathrm{sol}) + \Delta A_{\mathrm{vdw}}(\mathrm{sol}))

  (but also annihilates the interactions on the solute, corresponding to
  ``couple-intramol = yes``) whereas we do

  .. math::

      \Delta A = - (\Delta A_{\mathrm{coul}}(\mathrm{sol}) + \Delta A_{\mathrm{vdw}}(\mathrm{sol}))

- simulations are run as NPT (but make sure to use Langevin dynamics
  for temperature control)


Example
-------

see :mod:`mdpow`


User reference
--------------

Simulation setup and analysis of all FEP simulations is encapsulated
by a :class:`mdpow.fep.Gsolv` object. For the hydration free energy
there is a special class :class:`~mdpow.fep.Ghyd` and for the
solvation free energy in octanol there is
:class:`~mdpow.fep.Goct`. See the description of
:class:`~mdpow.fep.Gsolv` for methods common to both.

.. autoclass:: Gsolv
   :members:
   :inherited-members:

.. autoclass:: Ghyd
   :members:
   :inherited-members:

.. autoclass:: Goct
   :members:
   :inherited-members:

.. autoclass:: Gcyclo
   :members:
   :inherited-members:

.. autoclass:: Gtol
   :members:
   :inherited-members:

.. autofunction:: pOW

.. autofunction:: pCW

.. autofunction:: pTW

Developer notes
---------------

A user really only needs to access classes derived from
:class:`mdpow.fep.Gsolv`; all other classes and functions are auxiliary and
only of interest to developers.

Additional objects that support :class:`mdpow.fep.Gsolv`.

.. autoclass:: FEPschedule
   :members:
.. autofunction:: molar_to_nm3
.. autofunction:: kcal_to_kJ
.. autofunction:: kJ_to_kcal

.. data:: N_AVOGADRO

          Avogadro's constant |NA| in mol\ :sup:`-1` (`NA NIST value`_).

.. data:: kBOLTZ

          Boltzmann's constant |kB| in kJ mol\ :sup:`-1` (`kB NIST value`_).

.. |NA| replace:: *N*\ :sub:`A`
.. |kB| replace:: *k*\ :sub:`B`
.. _NA NIST value: http://physics.nist.gov/cgi-bin/cuu/Value?na
.. _kB NIST value: http://physics.nist.gov/cgi-bin/cuu/Value?k
.. |^-1| replace:: \ :sup:`-1`
.. |^-3| replace:: \ :sup:`-3`
.. |^3|  replace:: \ :sup:`3`

TODO
~~~~

- run minimization, NVT-equil, NPT-equil prior to production (probably
  use preprocessed topology from ``grompp -pp`` for portability)

  See `Free Energy Tutorial`_.

"""

import os
import errno
import copy
from subprocess import call
import warnings
from glob import glob
from configparser import NoOptionError

import numpy as np
import pandas as pd

import scipy.integrate
from scipy import constants

import numkit.integration
import numkit.timeseries
from numkit.observables import QuantityWithError

from alchemlyb.parsing.gmx import extract_dHdl, extract_u_nk
from alchemlyb.estimators import TI, BAR, MBAR
from alchemlyb.parsing.gmx import _extract_dataframe
from alchemlyb.preprocessing.subsampling import statistical_inefficiency

import gromacs
import gromacs.utilities

try:
    import gromacs.setup
except (ImportError, OSError):
    raise ImportError("Gromacs installation not found, source GMXRC?")
from gromacs.utilities import asiterable, AttributeDict, in_dir, openany

import logging

logger = logging.getLogger("mdpow.fep")

from . import config
from .restart import Journalled
from . import kBOLTZ, N_AVOGADRO


[docs]def molar_to_nm3(c):
    """Convert a concentration in Molar to nm|^-3|."""
    return c * N_AVOGADRO * 1e-24


def bar_to_kJmolnm3(p):
    """Convert pressure in bar to kJ mol|^-1| nm|^-3|.

    1 bar = 1e5 J m|^-3|
    """
    return p * N_AVOGADRO * 1e-25


[docs]def kcal_to_kJ(x):
    """Convert a energy in kcal to kJ."""
    return 4.184 * x


[docs]def kJ_to_kcal(x):
    """Convert a energy in kJ to kcal."""
    return x / 4.184


def kBT_to_kJ(x, T):
    """Convert a energy in kBT to kJ/mol."""
    return x * constants.N_A * constants.k * T * 1e-3


[docs]class FEPschedule(AttributeDict):
    """Describe mdp parameter choices as key - value pairs.

    The FEP schedule can be loaded from a configuration parser with
    the static method :meth:`FEPschedule.load`.

    See the example runinput file for an example. It contains the sections::

       [FEP_schedule_Coulomb]
       name = Coulomb
       description = dis-charging vdw+q --> vdw
       label = Coul
       couple_lambda0 = vdw-q
       couple_lambda1 = vdw
       # soft core alpha: linear scaling for coulomb
       sc_alpha = 0
       lambdas = 0, 0.25, 0.5, 0.75, 1.0


       [FEP_schedule_VDW]
       name = vdw
       description = decoupling vdw --> none
       label = VDW
       couple_lambda0 = vdw
       couple_lambda1 = none
       # recommended values for soft cores (Mobley, Shirts et al)
       sc_alpha = 0.5
       sc_power = 1
       sc_sigma = 0.3
       lambdas = 0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1

    """

    mdp_keywords = dict(
        (
            ("sc_alpha", float),
            ("sc_power", int),
            ("sc_sigma", float),
            ("couple_lambda0", str),
            ("couple_lambda1", str),
        )
    )
    meta_keywords = dict((("name", str), ("description", str), ("label", str)))
    other_keywords = dict((("lambdas", list),))

    @property
    def mdp_dict(self):
        """Dict of key-values that can be set in a mdp file."""
        return dict(((k, v) for k, v in self.items() if k in self.mdp_keywords))

[docs]    @staticmethod
    def load(cfg, section):
        """Initialize a :class:`FEPschedule` from the *section* in the configuration *cfg*"""
        keys = {}
        keys.update(FEPschedule.mdp_keywords)
        keys.update(FEPschedule.meta_keywords)
        keys.update(FEPschedule.other_keywords)

        cfg_get = {
            float: cfg.getfloat,
            int: cfg.getint,
            str: cfg.getstr,  # literal strings, no conversion of None (which we need for the MDP!)
            # CHECK: THIS IS LIKELY NOT GUARANTEED ANYMORE since getstr == get
            list: cfg.getarray,  # numpy float array from list
        }

        def getter(type, section, key):
            value = cfg_get[type](section, key)
            try:
                # for empty list [] and array array([])
                if len(value) == 0:
                    value = None
            except TypeError:
                pass
            return value

        # skip any None values
        return FEPschedule(
            (key, getter(keytype, section, key))
            for key, keytype in keys.items()
            if getter(keytype, section, key) is not None
        )

    def __deepcopy__(self, memo):
        x = FEPschedule()
        for k, v in self.items():
            x[k] = copy.deepcopy(v)
        return x


[docs]class Gsolv(Journalled):
    r"""Simulations to calculate and analyze the solvation free energy.

    :math:`\Delta A` is computed from the decharging and the
    decoupling step. With our choice of ``lambda=0`` being the fully
    interacting and ``lambda=1`` the non-interacting state, it is computed
    as

    .. math::

       \Delta A = -(\Delta A_{\mathrm{coul}} + \Delta A_{\mathrm{vdw}})

    With this protocol, the concentration in the liquid and in the gas
    phase is the same. (Under the assumption of ideal solution/ideal
    gas behaviour this directly relates to the Ben-Naim 1M/1M standard
    state.)

    (We neglect the negligible correction :math:`-kT \ln
    V_x/V_{\mathrm{sim}} = -kT \ln(1 - v_s/V_{\mathrm{sim}})` where
    :math:`V_x` is the volume of the system without the solute but the
    same number of water molecules as in the fully interacting case
    [see Michael Shirts' Thesis, p82].)


    Typical work flow::

       G = Gsolv(simulation='drug.simulation')           # continue from :mod:`mdpow.equil`
       G.setup(qscript=['my_template.sge', 'local.sh'])  # my_template.sge is user supplied
       G.qsub()    # run SGE job arrays as generated from my_template.sge
       G.analyze()
       G.plot()

    See :mod:`gromacs.qsub` for notes on how to write templates for
    queuing system scripts (in particular `queuing system templates`_).

    .. _queuing system templates:
       http://gromacswrapper.readthedocs.io/en/latest/gromacs/blocks/qsub.html#queuing-system-templates
    """

    topdir_default = "FEP"
    dirname_default = os.path.curdir
    solvent_default = "water"
    #: Check list of all methods that can be run as an independent protocol; see also
    #: :meth:`Simulation.get_protocol` and :class:`restart.Journal`
    protocols = ["setup", "fep_run"]

    #: Estimators in alchemlyb
    estimators = {
        "TI": {"extract": extract_dHdl, "estimator": TI},
        "BAR": {"extract": extract_u_nk, "estimator": BAR},
        "MBAR": {"extract": extract_u_nk, "estimator": MBAR},
    }

    # TODO: initialize from default cfg
    schedules_default = {
        "coulomb": FEPschedule(
            name="coulomb",
            description="dis-charging vdw+q --> vdw",
            label="Coul",
            couple_lambda0="vdw-q",
            couple_lambda1="vdw",
            sc_alpha=0,  # linear scaling for coulomb
            lambdas=np.array([0.0, 0.25, 0.5, 0.75, 1.0]),  # default values
        ),
        "vdw": FEPschedule(
            name="vdw",
            description="decoupling vdw --> none",
            label="VDW",
            couple_lambda0="vdw",
            couple_lambda1="none",
            sc_alpha=0.5,
            sc_power=1,
            sc_sigma=0.3,  # recommended values
            lambdas=np.array(
                [
                    0.0,
                    0.05,
                    0.1,
                    0.2,
                    0.3,
                    0.4,
                    0.5,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                    1,
                ]
            ),  # defaults
        ),
    }

    #: Default Gromacs *MDP* run parameter file for FEP.
    #: (The file is part of the package and is found with :func:`mdpow.config.get_template`.)
    mdp_default = "bar_opls.mdp"

    def __init__(self, molecule=None, top=None, struct=None, method="BAR", **kwargs):
        """Set up Gsolv from input files or a equilibrium simulation.

        :Arguments:
           *molecule*
               name of the molecule for which the hydration free
               energy is to be computed (as in the gromacs topology)
               [REQUIRED]
           *top*
               topology [REQUIRED]
           *struct*
               solvated and equilibrated input structure [REQUIRED]
           *ndx*
               index file
           *dirname*
               directory to work under ['FEP/*solvent*']
           *solvent*
               name of the solvent (only used for path names); will not affect
               the simulations because the input coordinates and topologies are
               determined by either *simulation* or *molecule*, *top*, and
               *struct*. If *solvent* is not provided then it is set to
               :attr:`solvent_default`.
           *lambda_coulomb*
               list of lambdas for discharging: q+vdw --> vdw
           *lambda_vdw*
               list of lambdas for decoupling: vdw --> none
           *runtime*
               simulation time per window in ps [5000]
           *temperature*
               temperature in Kelvin of the simulation [300.0]
           *qscript*
               template or list of templates for queuing system scripts
               (see :data:`gromacs.config.templates` for details) [local.sh]
           *includes*
               include directories
           *simulation*
               Instead of providing the required arguments, obtain the input
               files from a :class:`mdpow.equil.Simulation` instance.
           *method*
               "TI" for thermodynamic integration or "BAR" for Bennett acceptance
               ratio; using "BAR" in Gromacs also writes TI data so this is the
               default.
           *mdp*
               MDP file name (if no entry then the package defaults are used)
           *filename*
               Instead of providing the required arguments, load from pickle
               file. If either *simulation* or *molecule*, *top*, and *struct*
               are provided then simply set the attribute :attr:`filename` of
               the default checkpoint file to *filename*.
           *basedir*
               Prepend *basedir* to all filenames; ``None`` disables [.]
           *permissive*
               Set to ``True`` if you want to read past corrupt data in output
               xvg files (see :class:`gromacs.formats.XVG` for details); note that
               *permissive*=``True`` can lead to **wrong results**. Overrides
               the value set in a loaded pickle file. [``False``]
           *stride*
               collect every *stride* data line, see :meth:`Gsolv.collect` [1]
           *start*
               Start frame of data analyzed in every fep window.
           *stop*
               Stop frame of data analyzed in every fep window.
           *SI*
               Set to ``True`` if you want to perform statistical inefficiency
               to preprocess the data.
           *kwargs*
               other undocumented arguments (see source for the moment)

        .. Note::

           Only a subset of the FEP parameters can currently be set with
           keywords (lambdas); other parameters such as the soft cores are
           currently fixed. For advanced use: pass a dict with keys "coulomb"
           and "vdw" and values of :class:`FEPschedule` in the keyword
           *schedules*. The *lambdas* will override these settings (which
           typically come from a cfg). This method allows setting all
           parameters.

        """
        required_args = ("molecule", "top", "struct")

        # should this be below somewhere?
        if not method in ("TI", "BAR", "MBAR"):
            raise ValueError("method can only be TI, BAR, or MBAR")
        self.method = method

        filename = kwargs.pop("filename", None)
        basedir = kwargs.pop(
            "basedir", os.path.curdir
        )  # all other dirs relative to basedir
        simulation = kwargs.pop("simulation", None)
        solvent = kwargs.pop("solvent", self.solvent_default)
        if (
            None in (molecule, top, struct) and simulation is None
        ) and filename is not None:
            # load from pickle file
            self.load(filename)
            self.filename = filename
            kwargs = {}  # for super
        else:
            if simulation is not None:
                # load data from Simulation instance
                self.molecule = simulation.molecule
                self.top = (
                    simulation.files.processed_topology or simulation.files.topology
                )
                self.struct = simulation.files.MD_NPT
                self.ndx = simulation.files.ndx
                if simulation.solvent_type == solvent:
                    self.solvent_type = simulation.solvent_type
                else:
                    errmsg = (
                        "Solvent mismatch: simulation was run for %s but Gsolv is set up for %s"
                        % (simulation.solvent_type, solvent)
                    )
                    logger.error(errmsg)
                    raise ValueError(errmsg)
            else:
                self.molecule = molecule  # should check that this is in top (?)
                self.top = top
                self.struct = struct
                self.ndx = kwargs.pop("ndx", None)
                self.solvent_type = solvent

            for attr in required_args:
                if self.__getattribute__(attr) is None:
                    raise ValueError("A value is required for %(attr)r." % vars())

            # fix struct (issue with qscripts being independent from rest of code)
            if not os.path.exists(self.struct):
                # struct is not reliable as it depends on qscript so now we just try everything...
                struct = gromacs.utilities.find_first(
                    self.struct, suffices=["pdb", "gro"]
                )
                if struct is None:
                    logger.error("Starting structure %r does not exist.", self.struct)
                    raise IOError(
                        errno.ENOENT, "Starting structure not found", self.struct
                    )
                else:
                    logger.info(
                        "Found starting structure %r (instead of %r).",
                        struct,
                        self.struct,
                    )
                    self.struct = struct

            self.Temperature = kwargs.pop("temperature", 300.0)
            self.qscript = kwargs.pop("qscript", ["local.sh"])
            self.deffnm = kwargs.pop("deffnm", "md")

            self.mdp = kwargs.pop("mdp", config.get_template(self.mdp_default))

            # schedules (deepcopy because we might modify)
            # For some reason 2.7 tests failed with deepcopy in 2.7 so used merge_dict instead
            self.schedules = config.merge_dicts(self.schedules_default, {})
            schedules = kwargs.pop("schedules", {})
            self.schedules.update(schedules)
            self.lambdas = {
                "coulomb": kwargs.pop(
                    "lambda_coulomb", self.schedules["coulomb"].lambdas
                ),
                "vdw": kwargs.pop("lambda_vdw", self.schedules["vdw"].lambdas),
            }
            self.runtime = kwargs.pop("runtime", 5000.0)  # ps
            self.dirname = kwargs.pop("dirname", self.dirname_default)
            self.includes = list(asiterable(kwargs.pop("includes", []))) + [
                config.includedir
            ]
            self.component_dirs = {
                "coulomb": os.path.join(self.dirname, "Coulomb"),
                "vdw": os.path.join(self.dirname, "VDW"),
            }

            # for analysis
            self.stride = kwargs.pop("stride", 1)
            self.start = kwargs.pop("start", 0)
            self.stop = kwargs.pop("stop", None)
            self.SI = kwargs.pop("SI", True)

            # other variables
            #: Results from the analysis
            self.results = AttributeDict(
                xvg=AttributeDict(),
                dvdl=AttributeDict(),
                DeltaA=AttributeDict(),  # contains QuantityWithError
            )
            #: Generated run scripts
            self.scripts = AttributeDict()

            # sanity checks
            if os.path.exists(self.dirname):
                wmsg = (
                    "Directory %(dirname)r already exists --- will overwrite "
                    "existing files." % vars(self)
                )
                warnings.warn(wmsg)
                logger.warning(wmsg)

        # overrides pickle file so that we can run from elsewhere
        if not basedir is None:
            self.basedir = os.path.realpath(basedir)
        else:
            self.basedir = None

        try:
            self.filename = os.path.abspath(self.filename)
        except (AttributeError, TypeError):
            # default filename if none was provided
            self.filename = self.frombase(
                self.dirname, self.__class__.__name__ + os.extsep + "fep"
            )

        # override pickle file for this dangerous option: must be set
        # on a case-by-case basis
        self.permissive = kwargs.pop("permissive", False)

        logger.info(
            "Solvation free energy calculation for molecule "
            "%(molecule)s in solvent %(solvent_type)s.",
            vars(self),
        )
        logger.info("Base directory is %(basedir)r", vars(self))
        logger.info(
            "Using setup directories under %(dirname)r: %(component_dirs)r", vars(self)
        )
        logger.info("Default checkpoint file is %(filename)r", vars(self))
        logger.debug("Coulomb lambdas = %(coulomb)r", self.lambdas)
        logger.debug("VDW lambdas = %(vdw)r", self.lambdas)

        super(Gsolv, self).__init__(**kwargs)

[docs]    def frombase(self, *args):
        """Return path with :attr:`Gsolv.basedir` prefixed."""
        # wrap paths with frombase() and hopefully this allows us fairly
        # flexible use of the class, especially for analysis
        if self.basedir is None:
            return os.path.join(*args)
        return os.path.join(self.basedir, *args)

[docs]    def wname(self, component, lmbda):
        """Return name of the window directory itself.

        Typically something like ``VDW/0000``, ``VDW/0500``, ..., ``Coulomb/1000``
        """
        return os.path.join(self.component_dirs[component], "%04d" % (1000 * lmbda))

[docs]    def wdir(self, component, lmbda):
        """Return rooted path to the work directory for *component* and *lmbda*.

        (Constructed from :meth:`frombase` and :meth:`wname`.)
        """
        return self.frombase(self.wname(component, lmbda))

[docs]    def label(self, component):
        """Simple label for component, e.g. for use in filenames."""
        return self.schedules[component].label

[docs]    def tasklabel(self, component, lmbda):
        """Batch submission script name for a single task job."""
        return (
            self.molecule[:3]
            + "_"
            + self.schedules[component].label
            + "%04d" % (1000 * lmbda)
        )

[docs]    def arraylabel(self, component):
        """Batch submission script name for a job array."""
        return self.molecule[:3] + "_" + self.schedules[component].label

[docs]    def fep_dirs(self):
        """Generator for all simulation sub directories"""
        for component, lambdas in self.lambdas.items():
            for l in lambdas:
                yield self.wdir(component, l)

[docs]    def setup(self, **kwargs):
        """Prepare the input files for all Gromacs runs.

        :Keywords:
           *qscript*
               (List of) template(s) for batch submission scripts; if not set then
               the templates are used that were supplied to the constructor.
           *kwargs*
               Most kwargs are passed on to :func:`gromacs.setup.MD` although some
               are set to values that are required for the FEP functionality.

               *mdrun_opts*
                  list of options to :program:`mdrun`; ``-dhdl`` is always added
                  to this list as it is required for the thermodynamic integration
                  calculations
               *includes*
                  list of directories where Gromacs input files can be found

               The following keywords cannot be changed: *dirname*, *jobname*,
               *prefix*, *runtime*, *deffnm*. The last two can be specified when
               constructing :class:`Gsolv`.

        .. SeeAlso:: :func:`gromacs.setup.MD` and
                     :func:`gromacs.qsub.generate_submit_scripts`

        .. versionchanged:: 0.6.0
           Gromacs now uses option ``-dhdl`` instead of ``-dgdl``.
        """
        self.journal.start("setup")

        # -dgdl for FEP output (although that seems to have been changed to -dHdl in Gromacs 4.5.3)
        # NOW use -dhdl
        kwargs["mdrun_opts"] = " ".join([kwargs.pop("mdrun_opts", ""), "-dhdl"])
        kwargs["includes"] = asiterable(kwargs.pop("includes", [])) + self.includes
        kwargs["deffnm"] = self.deffnm
        kwargs.setdefault("maxwarn", 1)
        qsubargs = kwargs.copy()
        qsubargs["dirname"] = self.frombase(self.dirname)
        # handle templates separately (necessary for array jobs)
        qscripts = qsubargs.pop("sge", None) or self.qscript
        qscripts.extend(qsubargs.pop("qscript", []))  # also allow canonical 'templates'
        # make sure that the individual batch scripts are also written
        kwargs.setdefault("qscript", qscripts)

        for component, lambdas in self.lambdas.items():
            for l in lambdas:
                params = self._setup(component, l, foreign_lambdas=lambdas, **kwargs)

            # generate queuing system script for array job
            directories = [self.wdir(component, l) for l in lambdas]
            qsubargs["jobname"] = self.arraylabel(component)
            qsubargs["prefix"] = self.label(component) + "_"
            self.scripts[component] = gromacs.qsub.generate_submit_array(
                qscripts, directories, **qsubargs
            )
            logger.info(
                "[%s] Wrote array job scripts %r", component, self.scripts[component]
            )

        self.journal.completed("setup")
        self.save(self.filename)
        logger.info(
            "Saved state information to %r; reload later with G = %r.",
            self.filename,
            self,
        )
        logger.info("Finished setting up all individual simulations. Now run them...")
        params.pop("struct", None)  # scrub window-specific params
        return params

    def _setup(self, component, lmbda, foreign_lambdas, **kwargs):
        """Prepare the input files for an individual Gromacs runs."""

        # note that all arguments pertinent to the submission scripts should be in kwargs
        # and have been set in setup() so that it is easy to generate array job scripts
        logger.info("Preparing %(component)s for lambda=%(lmbda)g" % vars())

        wdir = self.wdir(component, lmbda)
        kwargs.setdefault("couple-intramol", "no")

        ### XXX Issue 20: if an entry is None then the dict will not be updated:
        ###     I *must* keep "none" as a legal string value
        kwargs.update(
            self.schedules[component].mdp_dict
        )  # sets soft core & lambda0/1 state

        if kwargs.pop("edr", True):
            logger.info("Setting dhdl file to edr format")
            kwargs.setdefault("separate-dhdl-file", "no")
        else:
            logger.info("Setting dhdl file to xvg format")
            kwargs.setdefault("separate-dhdl-file", "yes")

        foreign_lambdas = np.asarray(foreign_lambdas)
        lambda_index = np.where(foreign_lambdas == lmbda)[0][0]

        kwargs.update(
            dirname=wdir,
            struct=self.struct,
            top=self.top,
            mdp=self.mdp,
            ndx=self.ndx,
            mainselection=None,
            runtime=self.runtime,
            ref_t=self.Temperature,  # TODO: maybe not working yet, check _setup()
            gen_temp=self.Temperature,  # needed until gromacs.setup() is smarter
            qname=self.tasklabel(component, lmbda),
            free_energy="yes",
            couple_moltype=self.molecule,
            init_lambda_state=lambda_index,
            fep_lambdas=foreign_lambdas,
            calc_lambda_neighbors=-1,
        )

        return gromacs.setup.MD(**kwargs)

[docs]    def dgdl_xvg(self, *args):
        """Return filename of the dgdl XVG file.

        Recognizes uncompressed, gzipped (gz), and bzip2ed (bz2)
        files.

        :Arguments:
           *args*
               joins the arguments into a path and adds the default
               filename for the dvdl file

        :Returns: Checks if a compressed file exists and returns
                  the appropriate filename.

        :Raises: :exc:`IOError` with error code ENOENT if no file
                 could be found

        """
        EXTENSIONS = ("", os.path.extsep + "bz2", os.path.extsep + "gz")
        root = os.path.join(*args + (self.deffnm + ".xvg",))
        for ext in EXTENSIONS:
            fn = root + ext
            if os.path.exists(fn):
                return fn
        logger.error("Missing dgdl.xvg file %(root)r.", vars())
        raise IOError(errno.ENOENT, "Missing dgdl.xvg file", root)

[docs]    def dgdl_edr(self, *args):
        """Return filename of the dgdl EDR file.

        :Arguments:
           *args*
               joins the arguments into a path and adds the default
               filename for the dvdl file

        :Returns: path to EDR

        :Raises: :exc:`IOError` with error code ENOENT if no file
                 could be found

        """
        pattern = os.path.join(*args + (self.deffnm + "*.edr",))
        edrs = glob(pattern)
        if not edrs:
            logger.error("Missing dgdl.edr file %(pattern)r.", vars())
            raise IOError(errno.ENOENT, "Missing dgdl.edr file", pattern)
        return [os.path.abspath(i) for i in edrs]

[docs]    def dgdl_tpr(self, *args):
        """Return filename of the dgdl TPR file.

        :Arguments:
           *args*
               joins the arguments into a path and adds the default
               filename for the dvdl file

        :Returns: path to TPR

        :Raises: :exc:`IOError` with error code ENOENT if no file
                 could be found

        """
        fn = os.path.join(*args + (self.deffnm + ".tpr",))
        if not os.path.exists(fn):
            logger.error("Missing TPR file %(fn)r.", vars())
            raise IOError(errno.ENOENT, "Missing TPR file", fn)
        return fn

[docs]    def dgdl_total_edr(self, *args, **kwargs):
        """Return filename of the combined dgdl EDR file.

        :Arguments:
           *args*
               joins the arguments into a path and adds the default
               filename for the dvdl file

        :Keywords:
           *total_edr_name*
               Name of the user defined total edr file.

        :Returns: path to total EDR

        """
        total_edr_name = kwargs.get("total_edr_name", "total.edr")
        fn = os.path.join(*args + (total_edr_name,))
        return fn

[docs]    def convert_edr(self):
        """Convert EDR files to compressed XVG files."""

        logger.info("[%(dirname)s] Converting EDR -> XVG.bz2" % vars(self))

        for component, lambdas in self.lambdas.items():
            edr_files = [self.dgdl_edr(self.wdir(component, l)) for l in lambdas]
            tpr_files = [self.dgdl_tpr(self.wdir(component, l)) for l in lambdas]
            for tpr, edr in zip(tpr_files, edr_files):
                dirct = os.path.abspath(os.path.dirname(tpr))
                if len(edr) == 1:
                    total_edr = edr[0]
                else:
                    total_edr = self.dgdl_total_edr(dirct)
                    logger.info("  {0} --> {1}".format("edrs", total_edr))
                    gromacs.eneconv(f=edr, o=total_edr)
                xvgfile = os.path.join(dirct, self.deffnm + ".xvg")  # hack
                logger.info("  {0} --> {1}".format(total_edr, xvgfile))
                gromacs.g_energy(s=tpr, f=total_edr, odh=xvgfile)

[docs]    def collect(self, stride=None, autosave=True, autocompress=True):
        """Collect dV/dl from output.

        :Keywords:
           *stride*
              read data every *stride* lines, ``None`` uses the class default
           *autosave*
              immediately save the class pickle fep file
           *autocompress*
              compress the xvg file with bzip2 (saves >70% of space)
        """

        from gromacs.formats import XVG

        if not stride is None:
            self.stride = stride

        if autocompress:
            # must be done before adding to results.xvg or we will not find the file later
            self.compress_dgdl_xvg()

        logger.info(
            "[%(dirname)s] Finding dgdl xvg files, reading with "
            "stride=%(stride)d permissive=%(permissive)r." % vars(self)
        )

        for component, lambdas in self.lambdas.items():
            xvg_files = [self.dgdl_xvg(self.wdir(component, l)) for l in lambdas]
            self.results.xvg[component] = (
                np.array(lambdas),
                [
                    XVG(xvg, permissive=self.permissive, stride=self.stride)
                    for xvg in xvg_files
                ],
            )
        if autosave:
            self.save()

[docs]    def compress_dgdl_xvg(self):
        """Compress *all* dgdl xvg files with bzip2.

        .. Note:: After running this method you might want to run
           :meth:`collect` to ensure that the results in :attr:`results.xvg`
           point to the *compressed* files. Otherwise :exc:`IOError` might
           occur which fail to find a `md.xvg` file.

        """
        for component, lambdas in self.lambdas.items():
            xvg_files = [self.dgdl_xvg(self.wdir(component, l)) for l in lambdas]
            for xvg in xvg_files:
                root, ext = os.path.splitext(xvg)
                if ext == os.path.extsep + "xvg":
                    fnbz2 = xvg + os.path.extsep + "bz2"
                    logger.info(
                        "[%s] Compressing dgdl file %r with bzip2", self.dirname, xvg
                    )
                    # speed is similar to 'bzip2 -9 FILE' (using a 1 Mio buffer)
                    # (Since GW 0.8, openany() does not take kwargs anymore so the write buffer cannot be
                    # set anymore (buffering=1048576) so the performance might be lower in MDPOW >= 0.7.0)
                    with open(xvg, "rb", buffering=1048576) as source:
                        with openany(fnbz2, "wb") as target:
                            target.writelines(source)
                    if os.path.exists(fnbz2) and os.path.exists(xvg):
                        os.unlink(xvg)
                    if not os.path.exists(fnbz2):
                        logger.error(
                            "[%s] Failed to compress %r --- mysterious!",
                            self.dirname,
                            fnbz2,
                        )
                    else:
                        logger.info(
                            "[%s] Compression complete: %r", self.dirname, fnbz2
                        )

[docs]    def contains_corrupted_xvgs(self):
        """Check if any of the source datafiles had reported corrupted lines.

        :Returns: ``True`` if any of the xvg dgdl files were produced with the
                  permissive=True flag and had skipped lines.

        For debugging purposes, the number of corrupted lines are stored in
        :attr:`Gsolv._corrupted` as dicts of dicts with the component as
        primary and the lambda as secondary key.
        """

        def _lencorrupted(xvg):
            try:
                return len(xvg.corrupted_lineno)
            except AttributeError:  # backwards compatible (pre gw 0.1.10 are always ok)
                return 0
            except TypeError:  # len(None): XVG.parse() has not been run yet
                return 0  # ... so we cannot conclude that it does contain bad ones

        corrupted = {}
        self._corrupted = {}  # debugging ...
        for component, (lambdas, xvgs) in self.results.xvg.items():
            corrupted[component] = np.any([(_lencorrupted(xvg) > 0) for xvg in xvgs])
            self._corrupted[component] = dict(
                ((l, _lencorrupted(xvg)) for l, xvg in zip(lambdas, xvgs))
            )
        return np.any([x for x in corrupted.values()])

[docs]    def analyze(self, force=False, stride=None, autosave=True, ncorrel=25000):
        r"""Extract dV/dl from output and calculate dG by TI.

        Thermodynamic integration (TI) is performed on the individual
        component window calculation (typically the Coulomb and the
        VDW part, :math:`\Delta A_{\mathrm{coul}}` and :math:`\Delta
        A_{\mathrm{vdW}}`). :math:`\Delta A_{\mathrm{coul}}` is the free
        energy component of discharging the molecule and :math:`\Delta
        A_{\mathrm{vdW}}` of decoupling (switching off LJ interactions
        with the environment). The free energy components must be
        interpreted in this way because we defined ``lambda=0`` as
        interaction switched on and ``lambda=1`` as switched off.

        .. math::
            \Delta A^{*} = -(\Delta A_{\mathrm{coul}} + \Delta A_{\mathrm{vdw}})

        Data are stored in :attr:`Gsolv.results`.

        The dV/dlambda graphs are integrated with the composite Simpson's rule
        (and if the number of datapoints are even, the first interval is
        evaluated with the trapezoidal rule); see :func:`scipy.integrate.simps`
        for details). Note that this implementation of Simpson's rule does not
        require equidistant spacing on the lambda axis.

        For the Coulomb part using Simpson's rule has been shown to produce
        more accurate results than the trapezoidal rule [Jorge2010]_.

        Errors are estimated from the errors of the individual <dV/dlambda>:

         1. The error of the mean <dV/dlambda> is calculated via the decay time
            of the fluctuation around the mean. ``ncorrel`` is the max number of
            samples that is going to be used to calculate the autocorrelation
            time via a FFT. See :func:`numkit.timeseries.tcorrel`.

         2. The error on the integral is calculated analytically via
            propagation of errors through Simpson's rule (with the
            approximation that all spacings are assumed to be equal; taken as
            the average over all spacings as implemented in
            :func:`numkit.integration.simps_error`).

        .. Note:: For the Coulomb part, which typically only contains about 5
           lambdas, it is recommended to have a odd number of lambda values to
           fully benefit from the higher accuracy of the integration scheme.

        .. [Jorge2010] M. Jorge, N.M. Garrido, A.J. Queimada, I.G. Economou,
                       and E.A. Macedo. Effect of the integration method on the
                       accuracy and computational efficiency of free energy
                       calculations using thermodynamic integration. Journal of
                       Chemical Theory and Computation, 6 (4):1018--1027,
                       2010. 10.1021/ct900661c.

        :Keywords:
          *force*
              reload raw data even though it is already loaded
          *stride*
              read data every *stride* lines, ``None`` uses the class default
          *autosave*
              save to the pickle file when results have been computed
          *ncorrel*
              aim for <= 25,000 samples for t_correl

        .. rubric:: Notes

        The error on the mean of the data :math:`\epsilon_y`, taking
        the correlation time into account, is calculated according to
        [FrenkelSmit2002]_ `p526`_:

        .. math::

           \epsilon_y  = \sqrt{2 \tau_c \mathrm{acf}(0)/T}

        where :math:`\mathrm{acf}()` is the autocorrelation function
        of the fluctuations around the mean, :math:`y - \langle y
        \rangle`, :math:`\tau_c` is the correlation time, and :math:`T`
        the total length of the simulation

        .. [FrenkelSmit2002] D. Frenkel and B. Smit, Understanding
                             Molecular Simulation. Academic Press, San
                             Diego 2002

        .. _p526: http://books.google.co.uk/books?id=XmyO2oRUg0cC&pg=PA526
        """
        stride = stride or self.stride
        logger.info("Analysis stride is %s.", stride)

        if force or not self.has_dVdl():
            try:
                self.collect(stride=stride, autosave=False)
            except IOError as err:
                if err.errno == errno.ENOENT:
                    self.convert_edr()
                    self.collect(stride=stride, autosave=False)
                else:
                    logger.exception(err)
                    raise
        else:
            logger.info("Analyzing stored data.")

        # total free energy difference at const P (all simulations are done in NPT)
        GibbsFreeEnergy = QuantityWithError(0, 0)

        for component, (lambdas, xvgs) in self.results.xvg.items():
            logger.info(
                "[%s %s] Computing averages <dV/dl> and errors for %d lambda values.",
                self.molecule,
                component,
                len(lambdas),
            )
            # for TI just get the average dv/dl value (in array column 1; col 0 is the time)
            # (This can take a while if the XVG is now reading the array from disk first time)
            # Use XVG class properties: first data in column 0!
            Y = np.array([x.mean[0] for x in xvgs])
            stdY = np.array([x.std[0] for x in xvgs])

            # compute auto correlation time and error estimate for independent samples
            # (this can take a while). x.array[0] == time, x.array[1] == dHdl
            # nstep is calculated to give ncorrel samples (or all samples if less than ncorrel are
            # available)
            tc_data = [
                numkit.timeseries.tcorrel(
                    x.array[0],
                    x.array[1],
                    nstep=int(np.ceil(len(x.array[0]) / float(ncorrel))),
                )
                for x in xvgs
            ]
            DY = np.array([tc["sigma"] for tc in tc_data])
            tc = np.array([tc["tc"] for tc in tc_data])

            self.results.dvdl[component] = {
                "lambdas": lambdas,
                "mean": Y,
                "error": DY,
                "stddev": stdY,
                "tcorrel": tc,
            }
            # Combined Simpson rule integration:
            # even="last" because dV/dl is smoother at the beginning so using trapezoidal
            # integration there makes less of an error (one hopes...)
            a = scipy.integrate.simps(Y, x=lambdas, even="last")
            da = numkit.integration.simps_error(DY, x=lambdas, even="last")
            self.results.DeltaA[component] = QuantityWithError(a, da)
            GibbsFreeEnergy += self.results.DeltaA[
                component
            ]  # error propagation is automagic!

        # hydration free energy Delta A = -(Delta A_coul + Delta A_vdw)
        GibbsFreeEnergy *= -1
        self.results.DeltaA.Gibbs = GibbsFreeEnergy

        if autosave:
            self.save()

        self.logger_DeltaA0()
        return self.results.DeltaA.Gibbs

[docs]    def collect_alchemlyb(
        self, SI=True, start=0, stop=None, stride=None, autosave=True, autocompress=True
    ):
        """Collect the data files using alchemlyb.

        Unlike :meth:`collect`, this method can subsample with the
        statistical inefficiency (parameter `SI`).
        """
        extract = self.estimators[self.method]["extract"]

        if autocompress:
            # must be done before adding to results.xvg or we will not find the file later
            self.compress_dgdl_xvg()

        logger.info(
            "[%(dirname)s] Finding dgdl xvg files, reading with "
            "stride=%(stride)d permissive=%(permissive)r." % vars(self)
        )
        for component, lambdas in self.lambdas.items():
            val = []
            for l in lambdas:
                xvg_file = self.dgdl_xvg(self.wdir(component, l))
                xvg_df = extract(xvg_file, T=self.Temperature).iloc[start:stop:stride]
                if SI:
                    logger.info(
                        "Performing statistical inefficiency analysis for window %s %04d"
                        % (component, 1000 * l)
                    )
                    ts = _extract_dataframe(xvg_file).iloc[start:stop:stride]
                    ts = pd.DataFrame({"time": ts.iloc[:, 0], "dhdl": ts.iloc[:, 1]})
                    ts = ts.set_index("time")
                    # use the statistical_inefficiency function to subsample the data
                    xvg_df = statistical_inefficiency(xvg_df, ts, conservative=True)
                val.append(xvg_df)
            self.results.xvg[component] = (np.array(lambdas), pd.concat(val))

        if autosave:
            self.save()

[docs]    def analyze_alchemlyb(
        self, SI=True, start=0, stop=None, stride=None, force=False, autosave=True
    ):
        """Compute the free energy from the simulation data with alchemlyb.

        Unlike :meth:`analyze`, the MBAR estimator is available (in
        addition to TI). Note that SI should be enabled for meaningful
        error estimates.

        """
        stride = stride or self.stride
        start = start or self.start
        stop = stop or self.stop

        logger.info("Analysis stride is %s.", stride)
        logger.info("Analysis starts from frame %s.", start)
        logger.info("Analysis stops at frame %s.", stop)

        if self.method in ["TI", "BAR", "MBAR"]:
            estimator = self.estimators[self.method]["estimator"]
        else:
            errmsg = "The method is not supported."
            logger.error(errmsg)
            raise ValueError(errmsg)

        if force or not self.has_dVdl():
            try:
                self.collect_alchemlyb(SI, start, stop, stride, autosave=False)
            except IOError as err:
                if err.errno == errno.ENOENT:
                    self.convert_edr()
                    self.collect_alchemlyb(SI, start, stop, stride, autosave=False)
                else:
                    logger.exception(err)
                    raise
        else:
            logger.info("Analyzing stored data.")

        # total free energy difference at const P (all simulations are done in NPT)
        GibbsFreeEnergy = QuantityWithError(0, 0)

        for component, (lambdas, xvgs) in self.results.xvg.items():
            result = estimator().fit(xvgs)
            if self.method == "BAR":
                DeltaA = QuantityWithError(0, 0)
                a_s = np.diagonal(result.delta_f_, offset=1)
                da_s = np.diagonal(result.d_delta_f_, offset=1)
                for a, da in zip(a_s, da_s):
                    DeltaA += QuantityWithError(a, da)
                self.results.DeltaA[component] = kBT_to_kJ(DeltaA, self.Temperature)
            else:
                a = result.delta_f_.loc[0.00, 1.00]
                da = result.d_delta_f_.loc[0.00, 1.00]
                self.results.DeltaA[component] = kBT_to_kJ(
                    QuantityWithError(a, da), self.Temperature
                )
            GibbsFreeEnergy += self.results.DeltaA[
                component
            ]  # error propagation is automagic!

        # hydration free energy Delta A = -(Delta A_coul + Delta A_vdw)
        GibbsFreeEnergy *= -1
        self.results.DeltaA.Gibbs = GibbsFreeEnergy

        if autosave:
            self.save()

        self.logger_DeltaA0()
        return self.results.DeltaA.Gibbs

        if autosave:
            self.save()

[docs]    def write_DeltaA0(self, filename, mode="w"):
        """Write free energy components to a file.

        :Arguments:
            *filename*
                 name of the text file
            *mode*
                 'w' for overwrite or 'a' for append ['w']

        Format::
          .                 ----- kJ/mol ------
          molecule solvent  total  coulomb  vdw
        """
        with open(filename, mode) as tab:
            tab.write(self.summary() + "\n")

[docs]    def summary(self):
        """Return a string that summarizes the energetics.

        Each energy component is followed by its error.

        Format::

          .                 ------ kJ/mol -----
          molecule solvent  total  coulomb  vdw

        """
        fmt = "%-10s %-14s %+8.2f %8.2f  %+8.2f %8.2f  %+8.2f %8.2f"
        d = self.results.DeltaA
        return fmt % (
            (self.molecule, self.solvent_type)
            + d.Gibbs.astuple()
            + d.coulomb.astuple()
            + d.vdw.astuple()
        )

[docs]    def logger_DeltaA0(self):
        """Print the free energy contributions (errors in parentheses)."""
        if not "DeltaA" in self.results or len(self.results.DeltaA) == 0:
            logger.info("No DeltaA free energies computed yet.")
            return

        logger.info("DeltaG0 = -(DeltaG_coul + DeltaG_vdw)")
        for component, energy in self.results.DeltaA.items():
            logger.info(
                "[%s] %s solvation free energy (%s) %g (%.2f) kJ/mol",
                self.molecule,
                self.solvent_type.capitalize(),
                component,
                energy.value,
                energy.error,
            )

[docs]    def has_dVdl(self):
        """Check if dV/dl data have already been collected.

        :Returns: ``True`` if the dV/dl data have bee aquired
                  (:meth:`Gsolv.collect`) for all FEP simulations.
        """
        try:
            if len(self.results.xvg) == 0:
                return False
        except AttributeError:
            return False
        return np.all(
            np.array([len(xvgs) for (lambdas, xvgs) in self.results.xvg.values()]) > 0
        )

[docs]    def plot(self, **kwargs):
        """Plot the TI data with error bars.

        Run :meth:`mdpow.fep.Gsolv.analyze` first.

        All *kwargs* are passed on to :func:`pylab.errorbar`.

        :Returns: The axes of the subplot.
        """
        import matplotlib
        import matplotlib.pyplot as plt

        kwargs.setdefault("color", "black")
        kwargs.setdefault("capsize", 0)
        kwargs.setdefault("elinewidth", 2)

        try:
            if self.results.DeltaA.Gibbs is None or len(self.results.dvdl) == 0:
                raise KeyError
        except KeyError:
            logger.info("Data were not analyzed yet -- doing that now... patience.")
            self.analyze()

        dvdl = self.results.dvdl
        nplots = len(dvdl)
        fig, axs = plt.subplots(nrows=1, ncols=nplots)
        for i, component in enumerate(np.sort(dvdl.keys())):  # stable plot order
            x, y, dy = [dvdl[component][k] for k in ("lambdas", "mean", "error")]
            iplot = i
            ax = axs[i]
            energy = self.results.DeltaA[component]
            label = r"$\Delta A^{\rm{%s}}_{\rm{%s}} = %.2f\pm%.2f$ kJ/mol" % (
                component,
                self.solvent_type,
                energy.value,
                energy.error,
            )
            ax.errorbar(x, y, yerr=dy, label=label, **kwargs)
            ax.set_xlabel(r"$\lambda$")
            ax.legend(loc="best")
            ax.set_xlim(-0.05, 1.05)
        axs[0].set_ylabel(r"$dV/d\lambda$ in kJ/mol")
        fig.suptitle(
            r"Free energy difference $\Delta A^{0}_{\rm{%s}}$ for %s: $%.2f\pm%.2f$ kJ/mol"
            % (
                (
                    self.solvent_type,
                    self.molecule,
                )
                + self.results.DeltaA.Gibbs.astuple()
            )
        )
        fig.savefig("DeltaA.png")
        plt.close()
        return fig

[docs]    def qsub(self, script=None):
        """Submit a batch script locally.

        If *script* == ``None`` then take the first script (works well if only
        one template was provided).
        """
        from gromacs.qsub import relpath

        def choose_script_from(scripts):
            if script is None:
                s = scripts[0]
            elif script in scripts:
                s = script
            else:
                errmsg = "No script matching %(script)r in %(scripts)r" % vars()
                logger.error(errmsg)
                raise ValueError(errmsg)
            return s

        with in_dir(self.dirname, create=False):
            for component, scripts in self.scripts.items():
                s = relpath(
                    choose_script_from(scripts), self.dirname
                )  # relative to dirname
                cmd = ["qsub", s]
                logger.debug("[%s] submitting locally: %s", " ".join(cmd), component)
                rc = call(cmd)
                if rc != 0:
                    errmsg = "submitting job %(s)r failed with rc=%(rc)d" % vars()
                    logger.error(errmsg)
                    raise OSError(errmsg)

        logger.info(
            "[%r] Submitted jobs locally for %r", self.dirname, self.scripts.keys()
        )

    def __repr__(self):
        return "%s(filename=%r)" % (self.__class__.__name__, self.filename)


[docs]class Ghyd(Gsolv):
    """Sets up and analyses MD to obtain the hydration free energy of a solute."""

    solvent_default = "water"
    dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)


[docs]class Gcyclo(Gsolv):
    solvent_default = "cyclohexane"
    dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)


[docs]class Goct(Gsolv):
    """Sets up and analyses MD to obtain the solvation free energy of a solute in octanol.

    The *coulomb* lambda schedule is enhanced compared to water as the initial
    part of the dV/dl curve is quite sensitive. By adding two additional points
    we hope to reduce the overall error on the dis-charging free energy.
    """

    solvent_default = "octanol"
    dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)

    schedules = {
        "coulomb": FEPschedule(
            name="coulomb",
            description="dis-charging vdw+q --> vdw",
            label="Coul",
            couple_lambda0="vdw-q",
            couple_lambda1="vdw",
            sc_alpha=0,  # linear scaling for coulomb
            # lambdas=[0, 0.25, 0.5, 0.75, 1.0],  # default
            lambdas=[0, 0.125, 0.25, 0.375, 0.5, 0.75, 1.0],  # +0.125, 0.375 enhanced
        ),
        "vdw": FEPschedule(
            name="vdw",
            description="decoupling vdw --> none",
            label="VDW",
            couple_lambda0="vdw",
            couple_lambda1="none",
            sc_alpha=0.5,
            sc_power=1,
            sc_sigma=0.3,  # recommended values
            lambdas=[
                0.0,
                0.05,
                0.1,
                0.2,
                0.3,
                0.4,
                0.5,
                0.6,  # defaults
                0.65,
                0.7,
                0.75,
                0.8,
                0.85,
                0.9,
                0.95,
                1,
            ],
        ),
    }


class Gwoct(Goct):
    """Sets up and analyses MD to obtain the solvation free energy of a solute in wet octanol.

    The *coulomb* lambda schedule is enhanced compared to water as the initial
    part of the dV/dl curve is quite sensitive. By adding two additional points
    we hope to reduce the overall error on the dis-charging free energy.
    """

    solvent_default = "wetoctanol"
    dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)


[docs]class Gtol(Gsolv):
    """Sets up and analyses MD to obtain the solvation free energy of a solute in toluene."""

    solvent_default = "toluene"
    dirname_default = os.path.join(Gsolv.topdir_default, solvent_default)


def p_transfer(G1, G2, **kwargs):
    """Compute partition coefficient from two :class:`Gsolv` objects.

       The order determines the direction of transfer: G1 --> G2.

       E.g.: ``G1 = Gwat`` and ``G2 = Goct`` will compute a water-octanol
       transfer free energy and a water-octanol partition coefficient.

       transfer free energy from water into octanol::

            DeltaDeltaG0 = DeltaG0_oct - DeltaG0_water

       water-octanol partition coefficient::

            log P_oct/wat =  log [X]_oct/[X]_wat

    .. note::

       The partition coefficient corresponds to a transfer in the *opposite*
       direction compared to the free energy difference computed here. For
       instance, ``log P_oct/water`` is related to the free energy of transfer
       from octanol to water, ``Delta G_water - Delta G_oct``. ``log
       P_oct/water < 0`` means that the water phase is preferred, i.e. the
       probability to find the solute in octanol is *smaller* than the
       probability to find it in the water phase, or in other words, the
       hydration free energy ``Delta G_water`` is smaller (more negative) than
       the octanol solvation free energy ``Delta G_oct``.


    :Arguments:
       *G1*, *G2*
           *G1* and *G2* should be two  :class:`Gsolv` instances,
           order matters.
       *force*
           force rereading of data files even if some data were already stored [False]
       *stride*
           analyze every *stride*-th datapoint in the dV/dlambda files
       *start*
           Start frame of data analyzed in every fep window.
       *stop*
           Stop frame of data analyzed in every fep window.
       *SI*
           Set to ``True`` if you want to perform statistical inefficiency
           to preprocess the data.
       *estimator*
           Set to ``alchemlyb`` if you want to use alchemlyb estimators,
           or ``mdpow`` if you want the default TI method.
       *method*
           Use `TI`, `BAR` or `MBAR` method in `alchemlyb`, or `TI` in `mdpow`.

    :Returns: (transfer free energy, log10 of the water-octanol partition coefficient = log Pow)

    """

    kwargs.setdefault("force", False)
    estimator = kwargs.pop("estimator", "alchemlyb")
    if not estimator in ("mdpow", "alchemlyb"):
        errmsg = (
            "estimator = %r is not supported, must be 'mdpow' or 'alchemlyb'"
            % estimator
        )
        logger.error(errmsg)
        raise ValueError(errmsg)

    if G1.molecule != G2.molecule:
        raise ValueError("The two simulations were done for different molecules.")
    if G1.Temperature != G2.Temperature:
        raise ValueError("The two simulations were done at different temperatures.")

    logger.info(
        "[%s] transfer free energy %s --> %s calculation",
        G1.molecule,
        G1.solvent_type,
        G2.solvent_type,
    )
    for G in (G1, G2):
        G_kwargs = kwargs.copy()
        # for fep files generated with old code which doesn't have these attributes
        if not hasattr(G, "start"):
            G.start = G_kwargs.pop("start", 0)
        if not hasattr(G, "stop"):
            G.stop = G_kwargs.pop("stop", None)
        if not hasattr(G, "SI"):
            G_kwargs.setdefault("SI", True)
        else:
            G_kwargs.setdefault("SI", G.SI)

        # for this version. use the method given instead of the one in the input cfg file
        G.method = G_kwargs.pop("method", "MBAR")
        if estimator == "mdpow":
            if G.method != "TI":
                errmsg = (
                    "Method %s is not implemented in MDPOW, use estimator='alchemlyb'"
                    % G.method
                )
                logger.error(errmsg)
                raise ValueError(errmsg)

        logger.info("The solvent is %s .", G.solvent_type)
        if kwargs["force"] or "Gibbs" not in G.results.DeltaA:
            # write out the settings when the analysis is performed
            logger.info("Estimator is %s.", estimator)
            logger.info("Free energy calculation method is %s.", G.method)

            if estimator == "mdpow":
                G_kwargs.pop("SI", None)  # G.analyze() does not know SI
                G.analyze(**G_kwargs)
            elif estimator == "alchemlyb":
                logger.info(
                    "Statistical inefficiency analysis will %s be performed."
                    % ("" if G_kwargs["SI"] else "not")
                )
                G.analyze_alchemlyb(**G_kwargs)
        else:
            logger.info("Using already calculated free energy DeltaA")

    # x.Gibbs are QuantityWithError so they do error propagation
    transferFE = G2.results.DeltaA.Gibbs - G1.results.DeltaA.Gibbs
    # note minus sign, with our convention of the free energy difference to be
    # opposite to the partition coefficient.
    logPow = -transferFE / (kBOLTZ * G1.Temperature) * np.log10(np.e)

    molecule = G1.molecule

    # lower case initials, in reverse order of transfer, e.g.
    # water -> octanol:      P_ow
    # water -> cyclohexane:  P_cw
    # water -> toluene:      P_tw
    coefficient = "P_{0}{1}".format(
        G2.solvent_type.lower()[0], G1.solvent_type.lower()[0]
    )

    logger.info("[%s] Values at T = %g K", molecule, G1.Temperature)
    logger.info(
        "[%s] Free energy of transfer %s --> %s: %.3f (%.3f) kJ/mol",
        molecule,
        G1.solvent_type,
        G2.solvent_type,
        transferFE.value,
        transferFE.error,
    )
    logger.info(
        "[%s] log %s: %.3f (%.3f)", molecule, coefficient, logPow.value, logPow.error
    )

    return transferFE, logPow


[docs]def pOW(G1, G2, **kwargs):
    """Compute water-octanol partition coefficient from two :class:`Gsolv` objects.

    transfer free energy from water into octanol::

            DeltaDeltaG0 = DeltaG0_oct - DeltaG0_water

    octanol/water partition coefficient::

            log P_oct/wat =  log [X]_oct/[X]_wat

    :Arguments:
       *G1*, *G2*
           *G1* and *G2* should be a :class:`Ghyd` and a :class:`Goct` instance,
           but order does not matter
       *force*
           force rereading of data files even if some data were already stored [False]
       *stride*
           analyze every *stride*-th datapoint in the dV/dlambda files
       *start*
           Start frame of data analyzed in every fep window.
       *stop*
           Stop frame of data analyzed in every fep window.
       *SI*
           Set to ``True`` if you want to perform statistical inefficiency
           to preprocess the data.
       *estimator*
           Set to ``alchemlyb`` if you want to use alchemlyb estimators,
           or ``mdpow`` if you want the default TI method.
       *method*
           Use `TI`, `BAR` or `MBAR` method in `alchemlyb`, or `TI` in `mdpow`.

    :Returns: (transfer free energy, log10 of the octanol/water partition coefficient = log Pow)
    """
    if G1.solvent_type == "water" and G2.solvent_type == "octanol":
        args = (G1, G2)
    elif G1.solvent_type == "octanol" and G2.solvent_type == "water":
        args = (G2, G1)
    else:
        msg = "For pOW need water and octanol simulations but instead got {0} and {1}".format(
            G1.solvent_type, G2.solvent_type
        )
        logger.error(msg)
        raise ValueError(msg)
    return p_transfer(*args, **kwargs)


[docs]def pCW(G1, G2, **kwargs):
    """Compute water-cyclohexane partition coefficient from two :class:`Gsolv` objects.

    transfer free energy from water into cyclohexane::

            DeltaDeltaG0 = DeltaG0_cyclohexane - DeltaG0_water

    cyclohexane/water partition coefficient::

            log P_CW =  log [X]_cyclohexane/[X]_water

    :Arguments:
       *G1*, *G2*
           *G1* and *G2* should be a :class:`Ghyd` and a :class:`Gcyclo` instance,
           but order does not matter
       *force*
           force rereading of data files even if some data were already stored [False]
       *stride*
           analyze every *stride*-th datapoint in the dV/dlambda files
       *start*
           Start frame of data analyzed in every fep window.
       *stop*
           Stop frame of data analyzed in every fep window.
       *SI*
           Set to ``True`` if you want to perform statistical inefficiency
           to preprocess the data.
       *estimator*
           Set to ``alchemlyb`` if you want to use alchemlyb estimators,
           or ``mdpow`` if you want the default TI method.
       *method*
           Use `TI`, `BAR` or `MBAR` method in `alchemlyb`, or `TI` in `mdpow`.

    :Returns: (transfer free energy, log10 of the cyclohexane/water partition coefficient = log Pcw)
    """
    if G1.solvent_type == "water" and G2.solvent_type == "cyclohexane":
        args = (G1, G2)
    elif G1.solvent_type == "cyclohexane" and G2.solvent_type == "water":
        args = (G2, G1)
    else:
        msg = "For pCW need water and cyclohexane simulations but instead got {0} and {1}".format(
            G1.solvent_type, G2.solvent_type
        )
        logger.error(msg)
        raise ValueError(msg)
    return p_transfer(*args, **kwargs)


[docs]def pTW(G1, G2, **kwargs):
    """Compute water-toluene partition coefficient from two :class:`Gsolv` objects.

    transfer free energy from water into toluene::

            DeltaDeltaG0 = DeltaG0_tol - DeltaG0_water

    toluene/water partition coefficient::

            log P_tol/wat =  log [X]_tol/[X]_wat

    :Arguments:
       *G1*, *G2*
           *G1* and *G2* should be a :class:`Ghyd` and a :class:`Gtol` instance,
           but order does not matter
       *force*
           force rereading of data files even if some data were already stored [False]
       *stride*
           analyze every *stride*-th datapoint in the dV/dlambda files
       *start*
           Start frame of data analyzed in every fep window.
       *stop*
           Stop frame of data analyzed in every fep window.
       *SI*
           Set to ``True`` if you want to perform statistical inefficiency
           to preprocess the data.
       *estimator*
           Set to ``alchemlyb`` if you want to use alchemlyb estimators,
           or ``mdpow`` if you want the default TI method.
       *method*
           Use `TI`, `BAR` or `MBAR` method in `alchemlyb`, or `TI` in `mdpow`.

    :Returns: (transfer free energy, log10 of the toluene/water partition coefficient = log Ptw)
    """
    if G1.solvent_type == "water" and G2.solvent_type == "toluene":
        args = (G1, G2)
    elif G1.solvent_type == "toluene" and G2.solvent_type == "water":
        args = (G2, G1)
    else:
        msg = "For pTW need water and toluene simulations but instead got {0} and {1}".format(
            G1.solvent_type, G2.solvent_type
        )
        logger.error(msg)
        raise ValueError(msg)
    return p_transfer(*args, **kwargs)