From 91a6d03def7c91ac54d0b2f596ab7354fcdb8cbc Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Mon, 30 Oct 2017 11:12:54 -0400
Subject: [PATCH 01/14] switch to use subsampleCorrelatedData in pymbar to get
 the indices of the subsampled dataset

---
 src/alchemlyb/preprocessing/subsampling.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index 50703286..cc6f201e 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -4,6 +4,7 @@
 import numpy as np
 from pymbar.timeseries import statisticalInefficiency
 from pymbar.timeseries import detectEquilibration
+from pymbar.timeseries import subsampleCorrelatedData
 
 
 def _check_multiple_times(df):
@@ -97,14 +98,22 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None)
 
         # calculate statistical inefficiency of series
         statinef  = statisticalInefficiency(series)
+        
+        #use the subsampleCorrelatedData function to subsample the data
+        indices = subsampleCorrelatedData(series, g=statinef)
+        picked_time_index = []
+        #pick the time index for the pandas dataframe based on the python indices from subsample
+        for s_index, s_index_pair in enumerate(series.index):
+            if s_index in indices:
+                picked_time_index.append(s_index_pair[0])
 
         # we round up
-        statinef = int(np.rint(statinef))
-
+        #statinef = int(np.rint(statinef))
         # subsample according to statistical inefficiency
-        series = series.iloc[::statinef]
+        #series = series.iloc[::statinef]
 
-        df = df.loc[series.index]
+        #df = df.loc[series.index]
+        df = df.loc[picked_time_index]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
     

From 05fbdf0486533cd7db47e2cd777ed541699ce054 Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Mon, 30 Oct 2017 11:19:24 -0400
Subject: [PATCH 02/14] add the amber TI parser to get dhdl

---
 src/alchemlyb/parsing/amber.py | 265 +++++++++++++++++++++++++++++++++
 1 file changed, 265 insertions(+)
 create mode 100644 src/alchemlyb/parsing/amber.py

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
new file mode 100644
index 00000000..bfae1ae6
--- /dev/null
+++ b/src/alchemlyb/parsing/amber.py
@@ -0,0 +1,265 @@
+"""Parsers for extracting alchemical data from amber output files.
+Most of the file-parsing code is inherited from alchemical-analysis.
+Change the final format to pandas to be consistent with the alchemlyb format
+"""
+
+import pandas as pd
+import re
+import numpy as np
+import os
+
+def convert_to_pandas(file_datum, ):
+    data_dic = {}
+    data_dic["dHdl"] = []
+    data_dic["lambdas"] = []
+    data_dic["time"] = []
+    for frame_index, frame_dhdl in enumerate(file_datum.gradients):
+        data_dic["dHdl"].append(frame_dhdl)
+        data_dic["lambdas"].append(file_datum.clambda)
+        # here we need to convert dt from ns to ps
+        frame_time = file_datum.t0 + (frame_index + 1) * file_datum.dt*1000
+        data_dic["time"].append(frame_time)
+    time_index = pd.Index(data_dic["time"], name='time', dtype='float64')
+    df = pd.DataFrame(data_dic["dHdl"], columns=["dHdl"], index=time_index)
+    df["lambdas"] = file_datum.clambda  # also valid when there are no frames
+    return df.reset_index().set_index(['time', 'lambdas'])
+
+DVDL_COMPS = ['BOND', 'ANGLE', 'DIHED', '1-4 NB', '1-4 EEL', 'VDWAALS',
+              'EELEC', 'RESTRAINT']
+_FP_RE = r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'
+_MAGIC_CMPR = {
+    '\x1f\x8b\x08': ('gzip', 'GzipFile'),  # last byte is compression method
+    '\x42\x5a\x68': ('bz2', 'BZ2File')
+}
+
+def any_none(sequence):
+    """Check if any element of a sequence is None."""
+
+    # Identity test on purpose: falsy values such as 0 or '' are not None.
+    # ``any`` stops at the first hit, exactly like an explicit loop with an
+    # early return, and gives False for an empty sequence.
+
+    return any(element is None for element in sequence)
+
+def _pre_gen(it, first):
+    """Yield ``first`` when it is truthy, then every item left in ``it``."""
+
+    if first:
+        yield first
+
+    for item in it:  # ends cleanly; a bare next() would leak StopIteration
+        yield item
+
+class SectionParser(object):
+    """
+    A simple parser to extract data values from sections.
+    """
+    def __init__(self, filename):
+        """Opens a file according to its file type."""
+        self.filename = filename
+        with open(filename, 'rb') as f:
+            magic = f.read(3)   # NOTE: works because all 3-byte headers
+        try:
+            method = _MAGIC_CMPR[magic]
+        except KeyError:
+            open_it = open
+        else:
+            open_it = getattr(__import__(method[0]), method[1])
+        try:
+            self.fileh = open_it(self.filename, 'rb')
+            self.filesize = os.stat(self.filename).st_size
+        except IOError:
+            raise SystemExit('ERROR: cannot open file %s' % filename)
+        self.lineno = 0
+    def skip_lines(self, nlines):
+        """Skip a given number of files."""
+        lineno = 0
+        for line in self:
+            lineno += 1
+            if lineno > nlines:
+                return line
+        return None
+    def skip_after(self, pattern):
+        """Skip until after a line that matches a regex pattern."""
+        for line in self:
+            match = re.search(pattern, line)
+            if match:
+                break
+        return self.fileh.tell() != self.filesize
+    def extract_section(self, start, end, fields, limit=None, extra='',
+                        debug=False):
+        """
+        Extract data values (int, float) in fields from a section
+        marked with start and end regexes.  Do not read further than
+        limit regex.
+        """
+        inside = False
+        lines = []
+        for line in _pre_gen(self, extra):
+            if limit and re.search(limit, line):
+                break
+            if re.search(start, line):
+                inside = True
+            if inside:
+                if re.search(end, line):
+                    break
+                lines.append(line.rstrip('\n'))
+        line = ''.join(lines)
+        result = []
+        for field in fields:
+            match = re.search(r' %s\s+=\s+(\*+|%s|\d+)'
+                              % (field, _FP_RE), line)
+            if match:
+                value = match.group(1)
+                # FIXME: assumes fields are only integers or floats
+                if '*' in value:            # Fortran format overflow
+                    result.append(float('Inf') )
+                # NOTE: check if this is a sufficient test for int
+                elif '.' not in value and re.search(r'\d+', value):
+                    result.append(int(value))
+                else:
+                    result.append(float(value))
+            else:                       # section may be incomplete
+                result.append(None)
+        return result
+    def __iter__(self):
+        return self
+    def next(self):
+        """Read next line of the filehandle and check for EOF."""
+        self.lineno += 1
+        curr_pos = self.fileh.tell()
+        if curr_pos == self.filesize:
+            raise StopIteration
+        # NOTE: can't mix next() with seek()
+        return self.fileh.readline()
+    def close(self):
+        """Close the filehandle."""
+        self.fileh.close()
+    def __enter__(self):
+        return self
+    def __exit__(self, typ, value, traceback):
+        self.close()
+
+class FEData(object):
+    """A simple struct container to collect data from individual files."""
+
+    #__slots__ = ['clambda', 't0', 'dt', 'T', 'gradients',
+    #             'component_gradients', 'mbar_energies']
+    __slots__ = ['clambda', 't0', 'dt', 'T', 'gradients',
+                 'component_gradients']
+
+    def __init__(self):
+        self.clambda = -1.0
+        self.t0 = -1.0
+        self.dt = -1.0
+        self.T = -1.0
+        self.gradients = []
+        self.component_gradients = []
+        #self.mbar_energies = []
+
+
+def file_validation(outfile, ):
+    invalid = False
+    with SectionParser(outfile) as secp:
+        line = secp.skip_lines(5) 
+        if not line:
+            print('  WARNING: file does not contain any useful data, '
+                      'ignoring file')
+            invalid = True
+        if not secp.skip_after('^   2.  CONTROL  DATA  FOR  THE  RUN'):
+            print('  WARNING: no CONTROL DATA found, ignoring file')
+            invalid = True
+        ntpr, = secp.extract_section('^Nature and format of output:', '^$',
+                                     ['ntpr'])
+        nstlim, dt = secp.extract_section('Molecular dynamics:', '^$',
+                                          ['nstlim', 'dt'])
+        T, = secp.extract_section('temperature regulation:', '^$',
+                                 ['temp0'])
+        if not T:
+            raise SystemExit('ERROR: Non-constant temperature MD not '
+                             'currently supported')
+            invalid = True
+        clambda, = secp.extract_section('^Free energy options:', '^$',
+                                        ['clambda'], '^---')
+        if clambda is None:
+            print('  WARNING: no free energy section found, ignoring file')
+            invalid = True
+
+        if not secp.skip_after('^   3.  ATOMIC '):
+            print('  WARNING: no ATOMIC section found, ignoring file\n')
+            invalid = True
+
+        t0, = secp.extract_section('^ begin time', '^$', ['coords'])
+        if not secp.skip_after('^   4.  RESULTS'):
+            print('  WARNING: no RESULTS section found, ignoring file\n')
+            invalid = True
+    if invalid:
+        return False
+    else:
+        file_datum = FEData()
+        file_datum.clambda = clambda
+        file_datum.t0 = t0
+        file_datum.dt = dt
+        file_datum.T = T
+        return file_datum
+
+def extract_dHdl(outfile, ):
+    file_datum = file_validation(outfile)
+    if file_validation(outfile):
+        finished = False
+        comps = []
+        with SectionParser(outfile) as secp:
+            line = secp.skip_lines(5)
+            nensec = 0
+            nenav = 0
+            old_nstep = -1
+            old_comp_nstep = -1
+            high_E_cnt = 0
+
+            in_comps = False
+            for line in secp:
+                if 'DV/DL, AVERAGES OVER' in line:
+                    in_comps = True
+                if line.startswith(' NSTEP'):
+                    if in_comps:
+                        #CHECK the result
+                        result = secp.extract_section('^ NSTEP', '^ ---',
+                                                     ['NSTEP'] + DVDL_COMPS,
+                                                     extra=line)
+                        if result[0] != old_comp_nstep and not any_none(result):
+                            comps.append([float(E) for E in result[1:]])
+                            nenav += 1  
+                            old_comp_nstep = result[0]
+                        in_comps = False
+                    else:
+                        nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---',
+                                                           ['NSTEP', 'DV/DL'],
+                                                           extra=line)
+                        if nstep != old_nstep and dvdl is not None \
+                                and nstep is not None:
+                            file_datum.gradients.append(dvdl)
+                            nensec += 1
+                            old_nstep = nstep
+                if line == '   5.  TIMINGS\n':
+                    finished = True
+                    break
+        if not finished:
+            print('  WARNING: prematurely terminated run')
+        if not nensec:
+            print('  WARNING: File %s does not contain any DV/DL data\n' %
+                  outfile)
+        print('%i data points, %i DV/DL averages' % (nensec, nenav))
+        #at this step we get info stored in the FEData object for a given amber out file
+        file_datum.component_gradients.extend(comps)
+        #convert file_datum to the pandas format to make it identical to alchemlyb output format
+        df = convert_to_pandas(file_datum)        
+    else:
+        df = None
+    return df
+
+#currently just check the code with a simple amber ti output file
+#likely to switch to the alchmetest frame with more testing cases 
+if ("__main__") == (__name__):
+    dataset = "./amber_dataset/ti-0.00.out"
+    df = extract_dHdl(dataset)
+    print "Check the df", df

From 915062d085f6a2290ad692383e1fd9ac6443580f Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Tue, 31 Oct 2017 14:27:05 -0400
Subject: [PATCH 03/14] add test code to test amber TI parser

---
 src/alchemlyb/tests/parsing/test_amber.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 src/alchemlyb/tests/parsing/test_amber.py

diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py
new file mode 100644
index 00000000..23f866e8
--- /dev/null
+++ b/src/alchemlyb/tests/parsing/test_amber.py
@@ -0,0 +1,23 @@
+"""Amber parser tests.
+
+"""
+
+import sys
+sys.path.insert(0, "/home/shuai/Desktop/alchemlyb/alchemlyb/src")
+from alchemlyb.parsing.amber import extract_dHdl
+from alchemtest.amber import load_simplesolvated
+
+
+def test_dHdl():
+    """Test that dHdl has the correct form when extracted from files.
+
+    """
+    dataset = load_simplesolvated()
+
+    for leg in dataset['data']:
+        for filename in dataset['data'][leg]:
+            dHdl = extract_dHdl(filename,)
+
+            assert dHdl.index.names == ['time', 'lambdas']
+            assert dHdl.shape == (500, 1)
+

From b74c89cc830e02a2e3431d7d29e1f03f537e9b0d Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Tue, 31 Oct 2017 14:36:38 -0400
Subject: [PATCH 04/14] fix a path issue

---
 src/alchemlyb/tests/parsing/test_amber.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py
index 23f866e8..ad762be3 100644
--- a/src/alchemlyb/tests/parsing/test_amber.py
+++ b/src/alchemlyb/tests/parsing/test_amber.py
@@ -2,8 +2,6 @@
 
 """
 
-import sys
-sys.path.insert(0, "/home/shuai/Desktop/alchemlyb/alchemlyb/src")
 from alchemlyb.parsing.amber import extract_dHdl
 from alchemtest.amber import load_simplesolvated
 

From 208705bebf2e2eca7b5cf421c535b80c65624a93 Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Thu, 2 Nov 2017 14:57:03 -0400
Subject: [PATCH 05/14] add amber ti estimator test code

---
 .../tests/test_ti_estimators_amber.py         | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 src/alchemlyb/tests/test_ti_estimators_amber.py

diff --git a/src/alchemlyb/tests/test_ti_estimators_amber.py b/src/alchemlyb/tests/test_ti_estimators_amber.py
new file mode 100644
index 00000000..8e7dc02b
--- /dev/null
+++ b/src/alchemlyb/tests/test_ti_estimators_amber.py
@@ -0,0 +1,44 @@
+"""Tests for all TI-based estimators in ``alchemlyb``.
+
+"""
+import pytest
+
+import pandas as pd
+
+from alchemlyb.parsing import amber 
+from alchemlyb.estimators import TI
+import alchemtest.amber
+
+
+def amber_simplesolvated_charge_dHdl():
+    """Gather dH/dl from every 'charge' file of the simplesolvated set."""
+    charge_files = alchemtest.amber.load_simplesolvated()['data']['charge']
+    frames = []
+    for path in charge_files:
+        frames.append(amber.extract_dHdl(path))
+    return pd.concat(frames)
+
+def amber_simplesolvated_vdw_dHdl():
+    """Gather dH/dl from every 'vdw' file of the simplesolvated set."""
+    vdw_files = alchemtest.amber.load_simplesolvated()['data']['vdw']
+    frames = []
+    for path in vdw_files:
+        frames.append(amber.extract_dHdl(path))
+    return pd.concat(frames)
+
+
+class TIestimatorMixin:
+    """Shared delta_f check; a subclass supplies the estimator as ``cls``."""
+    @pytest.mark.parametrize('X_delta_f', ((amber_simplesolvated_charge_dHdl(), -60.114),
+                                           (amber_simplesolvated_vdw_dHdl(), 3.824)))
+    def test_get_delta_f(self, X_delta_f):
+        dHdl, expected = X_delta_f
+        computed = self.cls().fit(dHdl).delta_f_.iloc[0, -1]
+        assert expected == pytest.approx(computed, rel=1e-3)
+
+class TestTI(TIestimatorMixin):
+    """Run the checks inherited from TIestimatorMixin against ``TI``.
+
+    """
+    cls = TI 
+

From 715f3ee0b352e3003f45788f8988bc4fb17eb9b8 Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Thu, 2 Nov 2017 15:03:14 -0400
Subject: [PATCH 06/14] switch the subsampling to the original version

---
 src/alchemlyb/preprocessing/subsampling.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index cc6f201e..50703286 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -4,7 +4,6 @@
 import numpy as np
 from pymbar.timeseries import statisticalInefficiency
 from pymbar.timeseries import detectEquilibration
-from pymbar.timeseries import subsampleCorrelatedData
 
 
 def _check_multiple_times(df):
@@ -98,22 +97,14 @@ def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None)
 
         # calculate statistical inefficiency of series
         statinef  = statisticalInefficiency(series)
-        
-        #use the subsampleCorrelatedData function to subsample the data
-        indices = subsampleCorrelatedData(series, g=statinef)
-        picked_time_index = []
-        #pick the time index for the pandas dataframe based on the python indices from subsample
-        for s_index, s_index_pair in enumerate(series.index):
-            if s_index in indices:
-                picked_time_index.append(s_index_pair[0])
 
         # we round up
-        #statinef = int(np.rint(statinef))
+        statinef = int(np.rint(statinef))
+
         # subsample according to statistical inefficiency
-        #series = series.iloc[::statinef]
+        series = series.iloc[::statinef]
 
-        #df = df.loc[series.index]
-        df = df.loc[picked_time_index]
+        df = df.loc[series.index]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
     

From f64722de49b75136158b2ad6f2443166b0b0090e Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Thu, 2 Nov 2017 15:04:29 -0400
Subject: [PATCH 07/14] clean up the amber parser, add logging info etc

---
 src/alchemlyb/parsing/amber.py | 169 ++++++++++++++++++---------------
 1 file changed, 93 insertions(+), 76 deletions(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index bfae1ae6..1604170c 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -3,12 +3,16 @@
 Change the final format to pandas to be consistent with the alchemlyb format
 """
 
-import pandas as pd
+import os
 import re
+import pandas as pd
 import numpy as np
-import os
+import logging 
+
+logger = logging.getLogger("alchemlyb.parsers.Amber")
 
-def convert_to_pandas(file_datum, ):
+def convert_to_pandas(file_datum):
+    """Convert the data structure from numpy to pandas format"""
     data_dic = {}
     data_dic["dHdl"] = []
     data_dic["lambdas"] = []
@@ -68,9 +72,10 @@ def __init__(self, filename):
         try:
             self.fileh = open_it(self.filename, 'rb')
             self.filesize = os.stat(self.filename).st_size
-        except IOError:
-            raise SystemExit('ERROR: cannot open file %s' % filename)
+        except Exception as ex:
+            logging.exception("ERROR: cannot open file %s" % filename)
         self.lineno = 0
+
     def skip_lines(self, nlines):
         """Skip a given number of files."""
         lineno = 0
@@ -79,6 +84,7 @@ def skip_lines(self, nlines):
             if lineno > nlines:
                 return line
         return None
+
     def skip_after(self, pattern):
         """Skip until after a line that matches a regex pattern."""
         for line in self:
@@ -86,6 +92,7 @@ def skip_after(self, pattern):
             if match:
                 break
         return self.fileh.tell() != self.filesize
+
     def extract_section(self, start, end, fields, limit=None, extra='',
                         debug=False):
         """
@@ -122,8 +129,10 @@ def extract_section(self, start, end, fields, limit=None, extra='',
             else:                       # section may be incomplete
                 result.append(None)
         return result
+
     def __iter__(self):
         return self
+
     def next(self):
         """Read next line of the filehandle and check for EOF."""
         self.lineno += 1
@@ -132,19 +141,20 @@ def next(self):
             raise StopIteration
         # NOTE: can't mix next() with seek()
         return self.fileh.readline()
+
     def close(self):
         """Close the filehandle."""
         self.fileh.close()
+
     def __enter__(self):
         return self
+
     def __exit__(self, typ, value, traceback):
         self.close()
 
 class FEData(object):
     """A simple struct container to collect data from individual files."""
 
-    #__slots__ = ['clambda', 't0', 'dt', 'T', 'gradients',
-    #             'component_gradients', 'mbar_energies']
     __slots__ = ['clambda', 't0', 'dt', 'T', 'gradients',
                  'component_gradients']
 
@@ -155,19 +165,18 @@ def __init__(self):
         self.T = -1.0
         self.gradients = []
         self.component_gradients = []
-        #self.mbar_energies = []
-
 
-def file_validation(outfile, ):
+def file_validation(outfile):
+    """validate the energy output file """
     invalid = False
     with SectionParser(outfile) as secp:
         line = secp.skip_lines(5) 
         if not line:
-            print('  WARNING: file does not contain any useful data, '
-                      'ignoring file')
+            logging.warning('  WARNING: file does not contain any useful data, '
+                            'ignoring file')
             invalid = True
         if not secp.skip_after('^   2.  CONTROL  DATA  FOR  THE  RUN'):
-            print('  WARNING: no CONTROL DATA found, ignoring file')
+            logging.warning('  WARNING: no CONTROL DATA found, ignoring file')
             invalid = True
         ntpr, = secp.extract_section('^Nature and format of output:', '^$',
                                      ['ntpr'])
@@ -176,85 +185,93 @@ def file_validation(outfile, ):
         T, = secp.extract_section('temperature regulation:', '^$',
                                  ['temp0'])
         if not T:
-            raise SystemExit('ERROR: Non-constant temperature MD not '
-                             'currently supported')
+            logging.error('ERROR: Non-constant temperature MD not '
+                          'currently supported')
             invalid = True
         clambda, = secp.extract_section('^Free energy options:', '^$',
                                         ['clambda'], '^---')
         if clambda is None:
-            print('  WARNING: no free energy section found, ignoring file')
+            logging.warning('  WARNING: no free energy section found, ignoring file')
             invalid = True
 
         if not secp.skip_after('^   3.  ATOMIC '):
-            print('  WARNING: no ATOMIC section found, ignoring file\n')
+            logging.warning('  WARNING: no ATOMIC section found, ignoring file\n')
             invalid = True
 
         t0, = secp.extract_section('^ begin time', '^$', ['coords'])
         if not secp.skip_after('^   4.  RESULTS'):
-            print('  WARNING: no RESULTS section found, ignoring file\n')
+            logging.warning('  WARNING: no RESULTS section found, ignoring file\n')
             invalid = True
     if invalid:
         return False
-    else:
-        file_datum = FEData()
-        file_datum.clambda = clambda
-        file_datum.t0 = t0
-        file_datum.dt = dt
-        file_datum.T = T
-        return file_datum
+    file_datum = FEData()
+    file_datum.clambda = clambda
+    file_datum.t0 = t0
+    file_datum.dt = dt
+    file_datum.T = T
+    return file_datum
 
-def extract_dHdl(outfile, ):
-    file_datum = file_validation(outfile)
-    if file_validation(outfile):
-        finished = False
-        comps = []
-        with SectionParser(outfile) as secp:
-            line = secp.skip_lines(5)
-            nensec = 0
-            nenav = 0
-            old_nstep = -1
-            old_comp_nstep = -1
-            high_E_cnt = 0
+def extract_dHdl(outfile):
+    """Return gradients `dH/dl` from Amber TI output file
+    Parameters
+    ----------
+    outfile : str
+        Path to Amber .out file to extract data from.
 
-            in_comps = False
-            for line in secp:
-                if 'DV/DL, AVERAGES OVER' in line:
-                    in_comps = True
-                if line.startswith(' NSTEP'):
-                    if in_comps:
-                        #CHECK the result
-                        result = secp.extract_section('^ NSTEP', '^ ---',
-                                                     ['NSTEP'] + DVDL_COMPS,
-                                                     extra=line)
-                        if result[0] != old_comp_nstep and not any_none(result):
-                            comps.append([float(E) for E in result[1:]])
-                            nenav += 1  
-                            old_comp_nstep = result[0]
-                        in_comps = False
-                    else:
-                        nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---',
-                                                           ['NSTEP', 'DV/DL'],
-                                                           extra=line)
-                        if nstep != old_nstep and dvdl is not None \
-                                and nstep is not None:
-                            file_datum.gradients.append(dvdl)
-                            nensec += 1
-                            old_nstep = nstep
-                if line == '   5.  TIMINGS\n':
-                    finished = True
-                    break
-        if not finished:
-            print('  WARNING: prematurely terminated run')
-        if not nensec:
-            print('  WARNING: File %s does not contain any DV/DL data\n' %
-                  outfile)
-        print('%i data points, %i DV/DL averages' % (nensec, nenav))
-        #at this step we get info stored in the FEData object for a given amber out file
-        file_datum.component_gradients.extend(comps)
-        #convert file_datum to the pandas format to make it identical to alchemlyb output format
-        df = convert_to_pandas(file_datum)        
-    else:
-        df = None
+    Returns
+    -------
+    dH/dl : DataFrame or None
+        dH/dl over time for this lambda window; None if validation fails.
+    """
+    file_datum = file_validation(outfile)
+    if not file_datum:
+        return None
+    finished = False
+    comps = []
+    with SectionParser(outfile) as secp:
+        line = secp.skip_lines(5)
+        nensec = 0
+        nenav = 0
+        old_nstep = -1
+        old_comp_nstep = -1
+        # nensec: DV/DL samples kept; nenav: component-average blocks kept
+        in_comps = False
+        for line in secp:
+            if 'DV/DL, AVERAGES OVER' in line:
+                in_comps = True
+            if line.startswith(' NSTEP'):
+                if in_comps:
+                    # component averages: NSTEP plus every DVDL_COMPS term
+                    result = secp.extract_section('^ NSTEP', '^ ---',
+                                                 ['NSTEP'] + DVDL_COMPS,
+                                                 extra=line)
+                    if result[0] != old_comp_nstep and not any_none(result):
+                        comps.append([float(E) for E in result[1:]])
+                        nenav += 1  
+                        old_comp_nstep = result[0]
+                    in_comps = False
+                else:
+                    nstep, dvdl = secp.extract_section('^ NSTEP', '^ ---',
+                                                       ['NSTEP', 'DV/DL'],
+                                                       extra=line)
+                    if nstep != old_nstep and dvdl is not None \
+                            and nstep is not None:
+                        file_datum.gradients.append(dvdl)
+                        nensec += 1
+                        old_nstep = nstep
+            if line == '   5.  TIMINGS\n':
+                finished = True
+                break
+    if not finished:
+        logging.warning('  WARNING: prematurely terminated run')
+    if not nensec:
+        logging.warning('  WARNING: File %s does not contain any DV/DL data\n' %
+              outfile)
+    logging.info('%i data points, %i DV/DL averages' % (nensec, nenav))
+    # file_datum now holds everything parsed from this Amber output file
+    file_datum.component_gradients.extend(comps)
+    #convert file_datum to the pandas format to make it identical to alchemlyb output format
+    df = convert_to_pandas(file_datum)        
     return df
 
 #currently just check the code with a simple amber ti output file

From dcf997bdc561988c64efadd1ff94337d8efa4f76 Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Fri, 3 Nov 2017 09:42:48 -0400
Subject: [PATCH 08/14] remove the main test function

---
 src/alchemlyb/parsing/amber.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index 1604170c..6441eddf 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -273,10 +273,3 @@ def extract_dHdl(outfile):
     #convert file_datum to the pandas format to make it identical to alchemlyb output format
     df = convert_to_pandas(file_datum)        
     return df
-
-#currently just check the code with a simple amber ti output file
-#likely to switch to the alchmetest frame with more testing cases 
-if ("__main__") == (__name__):
-    dataset = "./amber_dataset/ti-0.00.out"
-    df = extract_dHdl(dataset)
-    print "Check the df", df

From 3c299573a0d536929fa216bfa43574e03fcdf097 Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Fri, 3 Nov 2017 12:17:19 -0400
Subject: [PATCH 09/14] change to amber parser to be compatible with python 3

---
 src/alchemlyb/parsing/amber.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index 6441eddf..2b24ba84 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -61,7 +61,7 @@ class SectionParser(object):
     def __init__(self, filename):
         """Opens a file according to its file type."""
         self.filename = filename
-        with open(filename, 'rb') as f:
+        with open(filename, 'r') as f:
             magic = f.read(3)   # NOTE: works because all 3-byte headers
         try:
             method = _MAGIC_CMPR[magic]
@@ -70,7 +70,7 @@ def __init__(self, filename):
         else:
             open_it = getattr(__import__(method[0]), method[1])
         try:
-            self.fileh = open_it(self.filename, 'rb')
+            self.fileh = open_it(self.filename, 'r')
             self.filesize = os.stat(self.filename).st_size
         except Exception as ex:
             logging.exception("ERROR: cannot open file %s" % filename)
@@ -141,6 +141,8 @@ def next(self):
             raise StopIteration
         # NOTE: can't mix next() with seek()
         return self.fileh.readline()
+    #make compatible with python 3.6
+    __next__ = next
 
     def close(self):
         """Close the filehandle."""
@@ -184,6 +186,7 @@ def file_validation(outfile):
                                           ['nstlim', 'dt'])
         T, = secp.extract_section('temperature regulation:', '^$',
                                  ['temp0'])
+        print ("Check the temperature", T)
         if not T:
             logging.error('ERROR: Non-constant temperature MD not '
                           'currently supported')

From a0e1a5bbc49b79e6f400a5f6fd6b101bf47ce20f Mon Sep 17 00:00:00 2001
From: shuai <shuailiu25@gmail.com>
Date: Fri, 3 Nov 2017 12:34:58 -0400
Subject: [PATCH 10/14] clean up a print line

---
 src/alchemlyb/parsing/amber.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index 2b24ba84..a2479787 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -186,7 +186,6 @@ def file_validation(outfile):
                                           ['nstlim', 'dt'])
         T, = secp.extract_section('temperature regulation:', '^$',
                                  ['temp0'])
-        print ("Check the temperature", T)
         if not T:
             logging.error('ERROR: Non-constant temperature MD not '
                           'currently supported')

From 3a7586675ed6e0a2ae1732215ec6fc8d702086af Mon Sep 17 00:00:00 2001
From: Shuai Liu <shuailiu25@gmail.com>
Date: Mon, 13 Nov 2017 00:33:40 -0800
Subject: [PATCH 11/14] add amber file validation test and switch the file
 opener to anyopen

---
 src/alchemlyb/parsing/amber.py            | 11 ++---------
 src/alchemlyb/tests/parsing/test_amber.py | 12 +++++++++++-
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index a2479787..5b8a1e5b 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import numpy as np
 import logging 
+from .util import anyopen
 
 logger = logging.getLogger("alchemlyb.parsers.Amber")
 
@@ -61,16 +62,8 @@ class SectionParser(object):
     def __init__(self, filename):
         """Opens a file according to its file type."""
         self.filename = filename
-        with open(filename, 'r') as f:
-            magic = f.read(3)   # NOTE: works because all 3-byte headers
         try:
-            method = _MAGIC_CMPR[magic]
-        except KeyError:
-            open_it = open
-        else:
-            open_it = getattr(__import__(method[0]), method[1])
-        try:
-            self.fileh = open_it(self.filename, 'r')
+            self.fileh = anyopen(self.filename, 'r')
             self.filesize = os.stat(self.filename).st_size
         except Exception as ex:
             logging.exception("ERROR: cannot open file %s" % filename)
diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py
index ad762be3..400dac46 100644
--- a/src/alchemlyb/tests/parsing/test_amber.py
+++ b/src/alchemlyb/tests/parsing/test_amber.py
@@ -1,9 +1,10 @@
 """Amber parser tests.
 
 """
-
 from alchemlyb.parsing.amber import extract_dHdl
+from alchemlyb.parsing.amber import file_validation
 from alchemtest.amber import load_simplesolvated
+from alchemtest.amber import load_invalidfiles
 
 
 def test_dHdl():
@@ -19,3 +20,12 @@ def test_dHdl():
             assert dHdl.index.names == ['time', 'lambdas']
             assert dHdl.shape == (500, 1)
 
+def test_invalidfiles():
+    """Test the file validation function to ensure that it returns False if the file is invalid
+    
+    """
+    invalid_files = load_invalidfiles()
+    
+    for invalid_file_list in invalid_files['data']:
+        for invalid_file in invalid_file_list:
+            assert file_validation(invalid_file) == False

From 46eb7e9ba12680c9b9c4e31f783bad26124337ea Mon Sep 17 00:00:00 2001
From: Shuai Liu <shuailiu25@gmail.com>
Date: Mon, 13 Nov 2017 10:19:13 -0800
Subject: [PATCH 12/14] change the EOF detection/skip_after function for zipped
 file, fix some other minor issues

---
 src/alchemlyb/parsing/amber.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index 5b8a1e5b..cf9cafac 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -5,10 +5,11 @@
 
 import os
 import re
+import logging 
 import pandas as pd
 import numpy as np
-import logging 
-from .util import anyopen
+
+from util import anyopen
 
 logger = logging.getLogger("alchemlyb.parsers.Amber")
 
@@ -32,10 +33,6 @@ def convert_to_pandas(file_datum):
 DVDL_COMPS = ['BOND', 'ANGLE', 'DIHED', '1-4 NB', '1-4 EEL', 'VDWAALS',
               'EELEC', 'RESTRAINT']
 _FP_RE = r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'
-_MAGIC_CMPR = {
-    '\x1f\x8b\x08': ('gzip', 'GzipFile'),  # last byte is compression method
-    '\x42\x5a\x68': ('bz2', 'BZ2File')
-}
 
 def any_none(sequence):
     """Check if any element of a sequence is None."""
@@ -53,7 +50,7 @@ def _pre_gen(it, first):
         yield first
 
     while it:
-        yield it.next()
+        yield next(it)
 
 class SectionParser(object):
     """
@@ -64,7 +61,6 @@ def __init__(self, filename):
         self.filename = filename
         try:
             self.fileh = anyopen(self.filename, 'r')
-            self.filesize = os.stat(self.filename).st_size
         except Exception as ex:
             logging.exception("ERROR: cannot open file %s" % filename)
         self.lineno = 0
@@ -80,11 +76,13 @@ def skip_lines(self, nlines):
 
     def skip_after(self, pattern):
         """Skip until after a line that matches a regex pattern."""
+        Found_pattern = False
         for line in self:
             match = re.search(pattern, line)
             if match:
+                Found_pattern = True
                 break
-        return self.fileh.tell() != self.filesize
+        return Found_pattern
 
     def extract_section(self, start, end, fields, limit=None, extra='',
                         debug=False):
@@ -129,11 +127,7 @@ def __iter__(self):
     def next(self):
         """Read next line of the filehandle and check for EOF."""
         self.lineno += 1
-        curr_pos = self.fileh.tell()
-        if curr_pos == self.filesize:
-            raise StopIteration
-        # NOTE: can't mix next() with seek()
-        return self.fileh.readline()
+        return next(self.fileh)
     #make compatible with python 3.6
     __next__ = next
 
@@ -207,7 +201,7 @@ def file_validation(outfile):
     return file_datum
 
 def extract_dHdl(outfile):
-    """Return gradients `dH/dl` from Amebr TI outputfile
+    """Return gradients `dH/dl` from Amber TI outputfile
     Parameters
     ----------
     outfile : str

From 49cc45cf3f310e1a6eff1d4c15f6fa2c17adbd09 Mon Sep 17 00:00:00 2001
From: Shuai Liu <shuailiu25@gmail.com>
Date: Mon, 13 Nov 2017 10:40:08 -0800
Subject: [PATCH 13/14] change to .util

---
 src/alchemlyb/parsing/amber.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/alchemlyb/parsing/amber.py b/src/alchemlyb/parsing/amber.py
index cf9cafac..2f47f16a 100644
--- a/src/alchemlyb/parsing/amber.py
+++ b/src/alchemlyb/parsing/amber.py
@@ -9,7 +9,7 @@
 import pandas as pd
 import numpy as np
 
-from util import anyopen
+from .util import anyopen
 
 logger = logging.getLogger("alchemlyb.parsers.Amber")
 

From 3bc2c67b8e874ca8b4f099401c329f79d61525f5 Mon Sep 17 00:00:00 2001
From: Shuai Liu <shuailiu25@gmail.com>
Date: Mon, 13 Nov 2017 11:43:13 -0800
Subject: [PATCH 14/14] add test function for any_none function in amber parser

---
 src/alchemlyb/tests/parsing/test_amber.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/alchemlyb/tests/parsing/test_amber.py b/src/alchemlyb/tests/parsing/test_amber.py
index 400dac46..c293ec05 100644
--- a/src/alchemlyb/tests/parsing/test_amber.py
+++ b/src/alchemlyb/tests/parsing/test_amber.py
@@ -3,6 +3,7 @@
 """
 from alchemlyb.parsing.amber import extract_dHdl
 from alchemlyb.parsing.amber import file_validation
+from alchemlyb.parsing.amber import any_none
 from alchemtest.amber import load_simplesolvated
 from alchemtest.amber import load_invalidfiles
 
@@ -29,3 +30,9 @@ def test_invalidfiles():
     for invalid_file_list in invalid_files['data']:
         for invalid_file in invalid_file_list:
             assert file_validation(invalid_file) == False
+
+def test_any_none():
+    """Test the any_none function to ensure that a None value in a sequence is caught
+    """
+    None_value_result = [150000, None, None, None, None, None, None, None, None]
+    assert any_none(None_value_result) == True