From 91a6d03def7c91ac54d0b2f596ab7354fcdb8cbc Mon Sep 17 00:00:00 2001 From: shuai Date: Mon, 30 Oct 2017 11:12:54 -0400 Subject: [PATCH 01/14] switch to use subsampleCorrelatedData in pymbar to get the indices of the subsampled dataset --- src/alchemlyb/preprocessing/subsampling.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index 50703286..cc6f201e 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -4,6 +4,7 @@ import numpy as np from pymbar.timeseries import statisticalInefficiency from pymbar.timeseries import detectEquilibration +from pymbar.timeseries import subsampleCorrelatedData def _check_multiple_times(df): @@ -97,14 +98,22 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None) # calculate statistical inefficiency of series statinef = statisticalInefficiency(series) + + #use the subsampleCorrelatedData function to subsample the data + indices = subsampleCorrelatedData(series, g=statinef) + picked_time_index = [] + #pick the time index for the pandas dataframe based on the python indices from subsample + for s_index, s_index_pair in enumerate(series.index): + if s_index in indices: + picked_time_index.append(s_index_pair[0]) # we round up - statinef = int(np.rint(statinef)) - + #statinef = int(np.rint(statinef)) # subsample according to statistical inefficiency - series = series.iloc[::statinef] + #series = series.iloc[::statinef] - df = df.loc[series.index] + #df = df.loc[series.index] + df = df.loc[picked_time_index] else: df = slicing(df, lower=lower, upper=upper, step=step) From 05fbdf0486533cd7db47e2cd777ed541699ce054 Mon Sep 17 00:00:00 2001 From: shuai Date: Mon, 30 Oct 2017 11:19:24 -0400 Subject: [PATCH 02/14] add the amber TI parser to get dhdl --- src/alchemlyb/parsing/amber.py | 265 +++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 src/alchemlyb/parsing/amber.py diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py new file mode 100644 index 00000000..bfae1ae6 --- /dev/null +++ b/src/alchemlyb/parsing/amber.py @@ -0,0 +1,265 @@ +"""Parsers for extracting alchemical data from amber output files. +Most of the file parsing part are inheriting from alchemical-analysis +Change the final format to pandas to be consistent with the alchemlyb format +""" + +import pandas as pd +import re +import numpy as np +import os + +def convert_to_pandas(file_datum, ): + data_dic = {} + data_dic["dHdl"] = [] + data_dic["lambdas"] = [] + data_dic["time"] = [] + for frame_index, frame_dhdl in enumerate(file_datum.gradients): + data_dic["dHdl"].append(frame_dhdl) + data_dic["lambdas"].append(file_datum.clambda) + #here we need to convert dt to ps unit from ns + frame_time = file_datum.t0 + (frame_index + 1) * file_datum.dt*1000 + data_dic["time"].append(frame_time) + df = pd.DataFrame(data_dic["dHdl"], columns=["dHdl"], index =pd.Float64Index(data_dic["time"], name='time')) + df["lambdas"] = data_dic["lambdas"][0] + df = df.reset_index().set_index(['time'] + ['lambdas']) + return df + +DVDL_COMPS = ['BOND', 'ANGLE', 'DIHED', '1-4 NB', '1-4 EEL', 'VDWAALS', + 'EELEC', 'RESTRAINT'] +_FP_RE = r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?' +_MAGIC_CMPR = { + '\x1f\x8b\x08': ('gzip', 'GzipFile'), # last byte is compression method + '\x42\x5a\x68': ('bz2', 'BZ2File') +} + +def any_none(sequence): + """Check if any element of a sequence is None.""" + + for element in sequence: + if element is None: + return True + + return False + +def _pre_gen(it, first): + """A generator that returns first first if it exists.""" + + if first: + yield first + + while it: + yield it.next() + +class SectionParser(object): + """ + A simple parser to extract data values from sections. + """ + def __init__(self, filename): + """Opens a file according to its file type.""" + self.filename = filename + with open(filename, 'rb') as f: + magic = f.read(3) # NOTE: works because all 3-byte headers + try: + method = _MAGIC_CMPR[magic] + except KeyError: + open_it = open + else: + open_it = getattr(__import__(method[0]), method[1]) + try: + self.fileh = open_it(self.filename, 'rb') + self.filesize = os.stat(self.filename).st_size + except IOError: + raise SystemExit('ERROR: cannot open file %s' % filename) + self.lineno = 0 + def skip_lines(self, nlines): + """Skip a given number of files.""" + lineno = 0 + for line in self: + lineno += 1 + if lineno > nlines: + return line + return None + def skip_after(self, pattern): + """Skip until after a line that matches a regex pattern.""" + for line in self: + match = re.search(pattern, line) + if match: + break + return self.fileh.tell() != self.filesize + def extract_section(self, start, end, fields, limit=None, extra='', + debug=False): + """ + Extract data values (int, float) in fields from a section + marked with start and end regexes. Do not read further than + limit regex. + """ + inside = False + lines = [] + for line in _pre_gen(self, extra): + if limit and re.search(limit, line): + break + if re.search(start, line): + inside = True + if inside: + if re.search(end, line): + break + lines.append(line.rstrip('\n')) + line = ''.join(lines) + result = [] + for field in fields: + match = re.search(r' %s\s+=\s+(\*+|%s|\d+)' + % (field, _FP_RE), line) + if match: + value = match.group(1) + # FIXME: assumes fields are only integers or floats + if '*' in value: # Fortran format overflow + result.append(float('Inf') ) + # NOTE: check if this is a sufficient test for int + elif '.' not in value and re.search(r'\d+', value): + result.append(int(value)) + else: + result.append(float(value)) + else: # section may be incomplete + result.append(None) + return result + def __iter__(self): + return self + def next(self): + """Read next line of the filehandle and check for EOF.""" + self.lineno += 1 + curr_pos = self.fileh.tell() + if curr_pos == self.filesize: + raise StopIteration + # NOTE: can't mix next() with seek() + return self.fileh.readline() + def close(self): + """Close the filehandle.""" + self.fileh.close() + def __enter__(self): + return self + def __exit__(self, typ, value, traceback): + self.close() + +class FEData(object): + """A simple struct container to collect data from individual files.""" + + #__slots__ = ['clambda', 't0', 'dt', 'T', 'gradients', + # 'component_gradients', 'mbar_energies'] + __slots__ = ['clambda', 't0', 'dt', 'T', 'gradients', + 'component_gradients'] + + def __init__(self): + self.clambda = -1.0 + self.t0 = -1.0 + self.dt = -1.0 + self.T = -1.0 + self.gradients = [] + self.component_gradients = [] + #self.mbar_energies = [] + + +def file_validation(outfile, ): + invalid = False + with SectionParser(outfile) as secp: + line = secp.skip_lines(5) + if not line: + print(' WARNING: file does not contain any useful data, ' + 'ignoring file') + invalid = True + if not secp.skip_after('^ 2. CONTROL DATA FOR THE RUN'): + print(' WARNING: no CONTROL DATA found, ignoring file') + invalid = True + ntpr, = secp.extract_section('^Nature and format of output:', '^$', + ['ntpr']) + nstlim, dt = secp.extract_section('Molecular dynamics:', '^$', + ['nstlim', 'dt']) + T, = secp.extract_section('temperature regulation:', '^$', + ['temp0']) + if not T: + raise SystemExit('ERROR: Non-constant temperature MD not ' + 'currently supported') + invalid = True + clambda, = secp.extract_section('^Free energy options:', '^$', + ['clambda'], '^---') + if clambda is None: + print(' WARNING: no free energy section found, ignoring file') + invalid = True + + if not secp.skip_after('^ 3. ATOMIC '): + print(' WARNING: no ATOMIC section found, ignoring file\n') + invalid = True + + t0, = secp.extract_section('^ begin time', '^$', ['coords']) + if not secp.skip_after('^ 4. RESULTS'): + print(' WARNING: no RESULTS section found, ignoring file\n') + invalid = True + if invalid: + return False + else: + file_datum = FEData() + file_datum.clambda = clambda + file_datum.t0 = t0 + file_datum.dt = dt + file_datum.T = T + return file_datum + +def extract_dHdl(outfile, ): + file_datum = file_validation(outfile) + if file_validation(outfile): + finished = False + comps = [] + with SectionParser(outfile) as secp: + line = secp.skip_lines(5) + nensec = 0 + nenav = 0 + old_nstep = -1 + old_comp_nstep = -1 + high_E_cnt = 0 + + in_comps = False + for line in secp: + if 'DV/DL, AVERAGES OVER' in line: + in_comps = True + if line.startswith(' NSTEP'): + if in_comps: + #CHECK the result + result = secp.extract_section('^ NSTEP', '^ ---', + ['NSTEP'] + DVDL_COMPS, + extra=line) + if result[0] != old_comp_nstep and not any_none(result): + comps.append([float(E) for E in result[1:]]) + nenav += 1 + old_comp_nstep = result[0] + in_comps = False + else: + nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---', + ['NSTEP', 'DV/DL'], + extra=line) + if nstep != old_nstep and dvdl is not None \ + and nstep is not None: + file_datum.gradients.append(dvdl) + nensec += 1 + old_nstep = nstep + if line == ' 5. TIMINGS\n': + finished = True + break + if not finished: + print(' WARNING: prematurely terminated run') + if not nensec: + print(' WARNING: File %s does not contain any DV/DL data\n' % + outfile) + print('%i data points, %i DV/DL averages' % (nensec, nenav)) + #at this step we get info stored in the FEData object for a given amber out file + file_datum.component_gradients.extend(comps) + #convert file_datum to the pandas format to make it identical to alchemlyb output format + df = convert_to_pandas(file_datum) + else: + df = None + return df + +#currently just check the code with a simple amber ti output file +#likely to switch to the alchmetest frame with more testing cases +if ("__main__") == (__name__): + dataset = "./amber_dataset/ti-0.00.out" + df = extract_dHdl(dataset) + print "Check the df", df From 915062d085f6a2290ad692383e1fd9ac6443580f Mon Sep 17 00:00:00 2001 From: shuai Date: Tue, 31 Oct 2017 14:27:05 -0400 Subject: [PATCH 03/14] add test code to test amber TI parser --- src/alchemlyb/tests/parsing/test_amber.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/alchemlyb/tests/parsing/test_amber.py diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py new file mode 100644 index 00000000..23f866e8 --- /dev/null +++ b/src/alchemlyb/tests/parsing/test_amber.py @@ -0,0 +1,23 @@ +"""Amber parser tests. + +""" + +import sys +sys.path.insert(0, "/home/shuai/Desktop/alchemlyb/alchemlyb/src") +from alchemlyb.parsing.amber import extract_dHdl +from alchemtest.amber import load_simplesolvated + + +def test_dHdl(): + """Test that dHdl has the correct form when extracted from files. + + """ + dataset = load_simplesolvated() + + for leg in dataset['data']: + for filename in dataset['data'][leg]: + dHdl = extract_dHdl(filename,) + + assert dHdl.index.names == ['time', 'lambdas'] + assert dHdl.shape == (500, 1) + From b74c89cc830e02a2e3431d7d29e1f03f537e9b0d Mon Sep 17 00:00:00 2001 From: shuai Date: Tue, 31 Oct 2017 14:36:38 -0400 Subject: [PATCH 04/14] fix a path issue --- src/alchemlyb/tests/parsing/test_amber.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py index 23f866e8..ad762be3 100644 --- a/src/alchemlyb/tests/parsing/test_amber.py +++ b/src/alchemlyb/tests/parsing/test_amber.py @@ -2,8 +2,6 @@ """ -import sys -sys.path.insert(0, "/home/shuai/Desktop/alchemlyb/alchemlyb/src") from alchemlyb.parsing.amber import extract_dHdl from alchemtest.amber import load_simplesolvated From 208705bebf2e2eca7b5cf421c535b80c65624a93 Mon Sep 17 00:00:00 2001 From: shuai Date: Thu, 2 Nov 2017 14:57:03 -0400 Subject: [PATCH 05/14] add amber ti estimator test code --- .../tests/test_ti_estimators_amber.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 src/alchemlyb/tests/test_ti_estimators_amber.py diff --git a/src/alchemlyb/tests/test_ti_estimators_amber.py b/src/alchemlyb/tests/test_ti_estimators_amber.py new file mode 100644 index 00000000..8e7dc02b --- /dev/null +++ b/src/alchemlyb/tests/test_ti_estimators_amber.py @@ -0,0 +1,44 @@ +"""Tests for all TI-based estimators in ``alchemlyb``. + +""" +import pytest + +import pandas as pd + +from alchemlyb.parsing import amber +from alchemlyb.estimators import TI +import alchemtest.amber + + +def amber_simplesolvated_charge_dHdl(): + dataset = alchemtest.amber.load_simplesolvated() + + dHdl = pd.concat([amber.extract_dHdl(filename) + for filename in dataset['data']['charge']]) + + return dHdl + +def amber_simplesolvated_vdw_dHdl(): + dataset = alchemtest.amber.load_simplesolvated() + + dHdl = pd.concat([amber.extract_dHdl(filename) + for filename in dataset['data']['vdw']]) + + return dHdl + + +class TIestimatorMixin: + + @pytest.mark.parametrize('X_delta_f', ((amber_simplesolvated_charge_dHdl(), -60.114), + (amber_simplesolvated_vdw_dHdl(), 3.824))) + def test_get_delta_f(self, X_delta_f): + est = self.cls().fit(X_delta_f[0]) + delta_f = est.delta_f_.iloc[0, -1] + assert X_delta_f[1] == pytest.approx(delta_f, rel=1e-3) + +class TestTI(TIestimatorMixin): + """Tests for TI. + + """ + cls = TI + From 715f3ee0b352e3003f45788f8988bc4fb17eb9b8 Mon Sep 17 00:00:00 2001 From: shuai Date: Thu, 2 Nov 2017 15:03:14 -0400 Subject: [PATCH 06/14] switch the subsampling to the original version --- src/alchemlyb/preprocessing/subsampling.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py index cc6f201e..50703286 100644 --- a/src/alchemlyb/preprocessing/subsampling.py +++ b/src/alchemlyb/preprocessing/subsampling.py @@ -4,7 +4,6 @@ import numpy as np from pymbar.timeseries import statisticalInefficiency from pymbar.timeseries import detectEquilibration -from pymbar.timeseries import subsampleCorrelatedData def _check_multiple_times(df): @@ -98,22 +97,14 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None) # calculate statistical inefficiency of series statinef = statisticalInefficiency(series) - - #use the subsampleCorrelatedData function to subsample the data - indices = subsampleCorrelatedData(series, g=statinef) - picked_time_index = [] - #pick the time index for the pandas dataframe based on the python indices from subsample - for s_index, s_index_pair in enumerate(series.index): - if s_index in indices: - picked_time_index.append(s_index_pair[0]) # we round up - #statinef = int(np.rint(statinef)) + statinef = int(np.rint(statinef)) + # subsample according to statistical inefficiency - #series = series.iloc[::statinef] + series = series.iloc[::statinef] - #df = df.loc[series.index] - df = df.loc[picked_time_index] + df = df.loc[series.index] else: df = slicing(df, lower=lower, upper=upper, step=step) From f64722de49b75136158b2ad6f2443166b0b0090e Mon Sep 17 00:00:00 2001 From: shuai Date: Thu, 2 Nov 2017 15:04:29 -0400 Subject: [PATCH 07/14] clean up the amber parser, add logging info etc --- src/alchemlyb/parsing/amber.py | 169 ++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 76 deletions(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index bfae1ae6..1604170c 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -3,12 +3,16 @@ Change the final format to pandas to be consistent with the alchemlyb format """ -import pandas as pd +import os import re +import pandas as pd import numpy as np -import os +import logging + +logger = logging.getLogger("alchemlyb.parsers.Amber") -def convert_to_pandas(file_datum, ): +def convert_to_pandas(file_datum): + """Convert the data structure from numpy to pandas format""" data_dic = {} data_dic["dHdl"] = [] data_dic["lambdas"] = [] @@ -68,9 +72,10 @@ def __init__(self, filename): try: self.fileh = open_it(self.filename, 'rb') self.filesize = os.stat(self.filename).st_size - except IOError: - raise SystemExit('ERROR: cannot open file %s' % filename) + except Exception as ex: + logging.exception("ERROR: cannot open file %s" % filename) self.lineno = 0 + def skip_lines(self, nlines): """Skip a given number of files.""" lineno = 0 @@ -79,6 +84,7 @@ def skip_lines(self, nlines): if lineno > nlines: return line return None + def skip_after(self, pattern): """Skip until after a line that matches a regex pattern.""" for line in self: @@ -86,6 +92,7 @@ def skip_after(self, pattern): if match: break return self.fileh.tell() != self.filesize + def extract_section(self, start, end, fields, limit=None, extra='', debug=False): """ @@ -122,8 +129,10 @@ def extract_section(self, start, end, fields, limit=None, extra='', else: # section may be incomplete result.append(None) return result + def __iter__(self): return self + def next(self): """Read next line of the filehandle and check for EOF.""" self.lineno += 1 @@ -132,19 +141,20 @@ def next(self): raise StopIteration # NOTE: can't mix next() with seek() return self.fileh.readline() + def close(self): """Close the filehandle.""" self.fileh.close() + def __enter__(self): return self + def __exit__(self, typ, value, traceback): self.close() class FEData(object): """A simple struct container to collect data from individual files.""" - #__slots__ = ['clambda', 't0', 'dt', 'T', 'gradients', - # 'component_gradients', 'mbar_energies'] __slots__ = ['clambda', 't0', 'dt', 'T', 'gradients', 'component_gradients'] @@ -155,19 +165,18 @@ def __init__(self): self.T = -1.0 self.gradients = [] self.component_gradients = [] - #self.mbar_energies = [] - -def file_validation(outfile, ): +def file_validation(outfile): + """validate the energy output file """ invalid = False with SectionParser(outfile) as secp: line = secp.skip_lines(5) if not line: - print(' WARNING: file does not contain any useful data, ' - 'ignoring file') + logging.warning(' WARNING: file does not contain any useful data, ' + 'ignoring file') invalid = True if not secp.skip_after('^ 2. CONTROL DATA FOR THE RUN'): - print(' WARNING: no CONTROL DATA found, ignoring file') + logging.warning(' WARNING: no CONTROL DATA found, ignoring file') invalid = True ntpr, = secp.extract_section('^Nature and format of output:', '^$', ['ntpr']) @@ -176,85 +185,93 @@ def file_validation(outfile, ): T, = secp.extract_section('temperature regulation:', '^$', ['temp0']) if not T: - raise SystemExit('ERROR: Non-constant temperature MD not ' - 'currently supported') + logging.error('ERROR: Non-constant temperature MD not ' + 'currently supported') invalid = True clambda, = secp.extract_section('^Free energy options:', '^$', ['clambda'], '^---') if clambda is None: - print(' WARNING: no free energy section found, ignoring file') + logging.warning(' WARNING: no free energy section found, ignoring file') invalid = True if not secp.skip_after('^ 3. ATOMIC '): - print(' WARNING: no ATOMIC section found, ignoring file\n') + logging.warning(' WARNING: no ATOMIC section found, ignoring file\n') invalid = True t0, = secp.extract_section('^ begin time', '^$', ['coords']) if not secp.skip_after('^ 4. RESULTS'): - print(' WARNING: no RESULTS section found, ignoring file\n') + logging.warning(' WARNING: no RESULTS section found, ignoring file\n') invalid = True if invalid: return False - else: - file_datum = FEData() - file_datum.clambda = clambda - file_datum.t0 = t0 - file_datum.dt = dt - file_datum.T = T - return file_datum + file_datum = FEData() + file_datum.clambda = clambda + file_datum.t0 = t0 + file_datum.dt = dt + file_datum.T = T + return file_datum -def extract_dHdl(outfile, ): - file_datum = file_validation(outfile) - if file_validation(outfile): - finished = False - comps = [] - with SectionParser(outfile) as secp: - line = secp.skip_lines(5) - nensec = 0 - nenav = 0 - old_nstep = -1 - old_comp_nstep = -1 - high_E_cnt = 0 +def extract_dHdl(outfile): + """Return gradients `dH/dl` from Amebr TI outputfile + Parameters + ---------- + outfile : str + Path to Amber .out file to extract data from. - in_comps = False - for line in secp: - if 'DV/DL, AVERAGES OVER' in line: - in_comps = True - if line.startswith(' NSTEP'): - if in_comps: - #CHECK the result - result = secp.extract_section('^ NSTEP', '^ ---', - ['NSTEP'] + DVDL_COMPS, - extra=line) - if result[0] != old_comp_nstep and not any_none(result): - comps.append([float(E) for E in result[1:]]) - nenav += 1 - old_comp_nstep = result[0] - in_comps = False - else: - nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---', - ['NSTEP', 'DV/DL'], - extra=line) - if nstep != old_nstep and dvdl is not None \ - and nstep is not None: - file_datum.gradients.append(dvdl) - nensec += 1 - old_nstep = nstep - if line == ' 5. TIMINGS\n': - finished = True - break - if not finished: - print(' WARNING: prematurely terminated run') - if not nensec: - print(' WARNING: File %s does not contain any DV/DL data\n' % - outfile) - print('%i data points, %i DV/DL averages' % (nensec, nenav)) - #at this step we get info stored in the FEData object for a given amber out file - file_datum.component_gradients.extend(comps) - #convert file_datum to the pandas format to make it identical to alchemlyb output format - df = convert_to_pandas(file_datum) - else: - df = None + Returns + ------- + dH/dl : Series + dH/dl as a function of time for this lambda window. + """ + file_datum = file_validation(outfile) + if not file_validation(outfile): + return None + finished = False + comps = [] + with SectionParser(outfile) as secp: + line = secp.skip_lines(5) + nensec = 0 + nenav = 0 + old_nstep = -1 + old_comp_nstep = -1 + high_E_cnt = 0 + in_comps = False + for line in secp: + if 'DV/DL, AVERAGES OVER' in line: + in_comps = True + if line.startswith(' NSTEP'): + if in_comps: + #CHECK the result + result = secp.extract_section('^ NSTEP', '^ ---', + ['NSTEP'] + DVDL_COMPS, + extra=line) + if result[0] != old_comp_nstep and not any_none(result): + comps.append([float(E) for E in result[1:]]) + nenav += 1 + old_comp_nstep = result[0] + in_comps = False + else: + nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---', + ['NSTEP', 'DV/DL'], + extra=line) + if nstep != old_nstep and dvdl is not None \ + and nstep is not None: + file_datum.gradients.append(dvdl) + nensec += 1 + old_nstep = nstep + if line == ' 5. TIMINGS\n': + finished = True + break + if not finished: + logging.warning(' WARNING: prematurely terminated run') + if not nensec: + logging.warning(' WARNING: File %s does not contain any DV/DL data\n' % + outfile) + logging.info('%i data points, %i DV/DL averages' % (nensec, nenav)) + #at this step we get info stored in the FEData object for a given amber out file + file_datum.component_gradients.extend(comps) + #convert file_datum to the pandas format to make it identical to alchemlyb output format + df = convert_to_pandas(file_datum) return df #currently just check the code with a simple amber ti output file From dcf997bdc561988c64efadd1ff94337d8efa4f76 Mon Sep 17 00:00:00 2001 From: shuai Date: Fri, 3 Nov 2017 09:42:48 -0400 Subject: [PATCH 08/14] remove the main test function --- src/alchemlyb/parsing/amber.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index 1604170c..6441eddf 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -273,10 +273,3 @@ def extract_dHdl(outfile): #convert file_datum to the pandas format to make it identical to alchemlyb output format df = convert_to_pandas(file_datum) return df - -#currently just check the code with a simple amber ti output file -#likely to switch to the alchmetest frame with more testing cases -if ("__main__") == (__name__): - dataset = "./amber_dataset/ti-0.00.out" - df = extract_dHdl(dataset) - print "Check the df", df From 3c299573a0d536929fa216bfa43574e03fcdf097 Mon Sep 17 00:00:00 2001 From: shuai Date: Fri, 3 Nov 2017 12:17:19 -0400 Subject: [PATCH 09/14] change to amber parser to be compatible with python 3 --- src/alchemlyb/parsing/amber.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index 6441eddf..2b24ba84 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -61,7 +61,7 @@ class SectionParser(object): def __init__(self, filename): """Opens a file according to its file type.""" self.filename = filename - with open(filename, 'rb') as f: + with open(filename, 'r') as f: magic = f.read(3) # NOTE: works because all 3-byte headers try: method = _MAGIC_CMPR[magic] @@ -70,7 +70,7 @@ def __init__(self, filename): else: open_it = getattr(__import__(method[0]), method[1]) try: - self.fileh = open_it(self.filename, 'rb') + self.fileh = open_it(self.filename, 'r') self.filesize = os.stat(self.filename).st_size except Exception as ex: logging.exception("ERROR: cannot open file %s" % filename) @@ -141,6 +141,8 @@ def next(self): raise StopIteration # NOTE: can't mix next() with seek() return self.fileh.readline() + #make compatible with python 3.6 + __next__ = next def close(self): """Close the filehandle.""" @@ -184,6 +186,7 @@ def file_validation(outfile): ['nstlim', 'dt']) T, = secp.extract_section('temperature regulation:', '^$', ['temp0']) + print ("Check the temperature", T) if not T: logging.error('ERROR: Non-constant temperature MD not ' 'currently supported') From a0e1a5bbc49b79e6f400a5f6fd6b101bf47ce20f Mon Sep 17 00:00:00 2001 From: shuai Date: Fri, 3 Nov 2017 12:34:58 -0400 Subject: [PATCH 10/14] clean up a print line --- src/alchemlyb/parsing/amber.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index 2b24ba84..a2479787 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -186,7 +186,6 @@ def file_validation(outfile): ['nstlim', 'dt']) T, = secp.extract_section('temperature regulation:', '^$', ['temp0']) - print ("Check the temperature", T) if not T: logging.error('ERROR: Non-constant temperature MD not ' 'currently supported') From 3a7586675ed6e0a2ae1732215ec6fc8d702086af Mon Sep 17 00:00:00 2001 From: Shuai Liu Date: Mon, 13 Nov 2017 00:33:40 -0800 Subject: [PATCH 11/14] add amber file validation test and switch the file opener to anyopen --- src/alchemlyb/parsing/amber.py | 11 ++--------- src/alchemlyb/tests/parsing/test_amber.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index a2479787..5b8a1e5b 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -8,6 +8,7 @@ import pandas as pd import numpy as np import logging +from .util import anyopen logger = logging.getLogger("alchemlyb.parsers.Amber") @@ -61,16 +62,8 @@ class SectionParser(object): def __init__(self, filename): """Opens a file according to its file type.""" self.filename = filename - with open(filename, 'r') as f: - magic = f.read(3) # NOTE: works because all 3-byte headers try: - method = _MAGIC_CMPR[magic] - except KeyError: - open_it = open - else: - open_it = getattr(__import__(method[0]), method[1]) - try: - self.fileh = open_it(self.filename, 'r') + self.fileh = anyopen(self.filename, 'r') self.filesize = os.stat(self.filename).st_size except Exception as ex: logging.exception("ERROR: cannot open file %s" % filename) diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py index ad762be3..400dac46 100644 --- a/src/alchemlyb/tests/parsing/test_amber.py +++ b/src/alchemlyb/tests/parsing/test_amber.py @@ -1,9 +1,10 @@ """Amber parser tests. """ - from alchemlyb.parsing.amber import extract_dHdl +from alchemlyb.parsing.amber import file_validation from alchemtest.amber import load_simplesolvated +from alchemtest.amber import load_invalidfiles def test_dHdl(): @@ -19,3 +20,12 @@ def test_dHdl(): assert dHdl.index.names == ['time', 'lambdas'] assert dHdl.shape == (500, 1) +def test_invalidfiles(): + """Test the file validation function to ensure the function returning False if the file is invalid + + """ + invalid_files = load_invalidfiles() + + for invalid_file_list in invalid_files['data']: + for invalid_file in invalid_file_list: + assert file_validation(invalid_file) == False From 46eb7e9ba12680c9b9c4e31f783bad26124337ea Mon Sep 17 00:00:00 2001 From: Shuai Liu Date: Mon, 13 Nov 2017 10:19:13 -0800 Subject: [PATCH 12/14] change the EOF detection/skip_after function for ziped file, fix some other minor issues --- src/alchemlyb/parsing/amber.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index 5b8a1e5b..cf9cafac 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -5,10 +5,11 @@ import os import re +import logging import pandas as pd import numpy as np -import logging -from .util import anyopen + +from util import anyopen logger = logging.getLogger("alchemlyb.parsers.Amber") @@ -32,10 +33,6 @@ def convert_to_pandas(file_datum): DVDL_COMPS = ['BOND', 'ANGLE', 'DIHED', '1-4 NB', '1-4 EEL', 'VDWAALS', 'EELEC', 'RESTRAINT'] _FP_RE = r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?' -_MAGIC_CMPR = { - '\x1f\x8b\x08': ('gzip', 'GzipFile'), # last byte is compression method - '\x42\x5a\x68': ('bz2', 'BZ2File') -} def any_none(sequence): """Check if any element of a sequence is None.""" @@ -53,7 +50,7 @@ def _pre_gen(it, first): yield first while it: - yield it.next() + yield next(it) class SectionParser(object): """ @@ -64,7 +61,6 @@ def __init__(self, filename): self.filename = filename try: self.fileh = anyopen(self.filename, 'r') - self.filesize = os.stat(self.filename).st_size except Exception as ex: logging.exception("ERROR: cannot open file %s" % filename) self.lineno = 0 @@ -80,11 +76,13 @@ def skip_lines(self, nlines): def skip_after(self, pattern): """Skip until after a line that matches a regex pattern.""" + Found_pattern = False for line in self: match = re.search(pattern, line) if match: + Found_pattern = True break - return self.fileh.tell() != self.filesize + return Found_pattern def extract_section(self, start, end, fields, limit=None, extra='', debug=False): @@ -129,11 +127,7 @@ def __iter__(self): def next(self): """Read next line of the filehandle and check for EOF.""" self.lineno += 1 - curr_pos = self.fileh.tell() - if curr_pos == self.filesize: - raise StopIteration - # NOTE: can't mix next() with seek() - return self.fileh.readline() + return next(self.fileh) #make compatible with python 3.6 __next__ = next @@ -207,7 +201,7 @@ def file_validation(outfile): return file_datum def extract_dHdl(outfile): - """Return gradients `dH/dl` from Amebr TI outputfile + """Return gradients `dH/dl` from Amber TI outputfile Parameters ---------- outfile : str From 49cc45cf3f310e1a6eff1d4c15f6fa2c17adbd09 Mon Sep 17 00:00:00 2001 From: Shuai Liu Date: Mon, 13 Nov 2017 10:40:08 -0800 Subject: [PATCH 13/14] change to .util --- src/alchemlyb/parsing/amber.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py index cf9cafac..2f47f16a 100644 --- a/src/alchemlyb/parsing/amber.py +++ b/src/alchemlyb/parsing/amber.py @@ -9,7 +9,7 @@ import pandas as pd import numpy as np -from util import anyopen +from .util import anyopen logger = logging.getLogger("alchemlyb.parsers.Amber") From 3bc2c67b8e874ca8b4f099401c329f79d61525f5 Mon Sep 17 00:00:00 2001 From: Shuai Liu Date: Mon, 13 Nov 2017 11:43:13 -0800 Subject: [PATCH 14/14] add test function for any_none function in amber parser --- src/alchemlyb/tests/parsing/test_amber.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py index 400dac46..c293ec05 100644 --- a/src/alchemlyb/tests/parsing/test_amber.py +++ b/src/alchemlyb/tests/parsing/test_amber.py @@ -3,6 +3,7 @@ """ from alchemlyb.parsing.amber import extract_dHdl from alchemlyb.parsing.amber import file_validation +from alchemlyb.parsing.amber import any_none from alchemtest.amber import load_simplesolvated from alchemtest.amber import load_invalidfiles @@ -29,3 +30,9 @@ def test_invalidfiles(): for invalid_file_list in invalid_files['data']: for invalid_file in invalid_file_list: assert file_validation(invalid_file) == False + +def test_any_none(): + """Test the any None function to ensure if the None value will be caught + """ + None_value_result = [150000, None, None, None, None, None, None, None, None] + assert any_none(None_value_result) == True