From 91a25dbf75179990f8a5d63f96c9a5ea878b0520 Mon Sep 17 00:00:00 2001 From: m-julian <52214154+m-julian@users.noreply.github.com> Date: Thu, 16 May 2024 21:03:51 +0100 Subject: [PATCH] new way of parsing HISTORY --- .../core/files/dl_poly/dl_poly_history.py | 360 +++++++++++++----- 1 file changed, 269 insertions(+), 91 deletions(-) diff --git a/ichor_core/ichor/core/files/dl_poly/dl_poly_history.py b/ichor_core/ichor/core/files/dl_poly/dl_poly_history.py index 3a5d42de..cb6dec29 100644 --- a/ichor_core/ichor/core/files/dl_poly/dl_poly_history.py +++ b/ichor_core/ichor/core/files/dl_poly/dl_poly_history.py @@ -100,6 +100,23 @@ class DlpolyHistory(Trajectory): Inherits from Trajectory as is a list of Atoms Builds on the Trajectory class by adding DLPOLY information provided by the HISTORY file + + .. warning:: + Indexing the history as a python list, i.e. history[1000] + is not guaranteed to give the 1000th timestep (0-indexed). + This is because sometimes there is binary written to the HISTORY + file which messes up some geometries. These geometries are excluded + from the read in data, so indexing as a list will might return a + different timestep. + + To make sure that the exact timestep is returned (useful when you + also want to get data from FFLUX or IQA_ENERGIES/IQA_FORCES file, + then ensure that you check the ``ntimestep`` attribute of a timestep). + This will be correct, even if some geoemtries are missing. + + To get a list of missing timesteps, use the self.removed_timesteps attribute + of the DlPolyHistory class. + """ _filetype = "" @@ -113,6 +130,7 @@ def __init__(self, path: Optional[Path] = Path("HISTORY")): self.periodic_boundary = FileContents self.number_of_atoms = FileContents self.ntimesteps = FileContents + self.existing_timesteps = FileContents self.removed_timesteps = FileContents @classmethod @@ -121,116 +139,276 @@ def check_path(cls, path: Path) -> bool: def _read_file(self): - with open(self.path, "r") as f: + # we will read the first lines to get number of atoms + # then read in a chunk of lines, if binary is encountered then discard all geometries + # this will likely discard extra geometries, but will ensure that + # all geometries are read in correctly at least + + try: + + with open(self.path, "r") as f: + + line = next(f) + self.title = line + line = next(f) + record = line.split() + self.trajectory_key = DlpolyTrajectoryKey(int(record[0])) + self.periodic_boundary = DlpolyTrajectoryKey(int(record[1])) + self.number_of_atoms = int(record[2]) + self.ntimesteps = int(record[3]) + + # read number of lines depending of whether + # coordinates, velocities, and/or forces are written in the file + if self.trajectory_key is DlpolyTrajectoryKey.Coordinate: + nlines_to_read = self.number_of_atoms * 2 + 3 + elif self.trajectory_key is DlpolyTrajectoryKey.CoordinateVelocity: + nlines_to_read = self.number_of_atoms * 3 + 3 + elif self.trajectory_key is DlpolyTrajectoryKey.CoordinateVelocityForce: + nlines_to_read = self.number_of_atoms * 4 + 3 - self.title = next(f) - record = next(f).split() - self.trajectory_key = DlpolyTrajectoryKey(int(record[0])) - self.periodic_boundary = DlpolyTrajectoryKey(int(record[1])) - self.number_of_atoms = int(record[2]) - self.ntimesteps = int(record[3]) - - removed_timesteps = [] - timestep_contains_binary = False - - try: while True: - # sometimes binary data is written in HISTORY file - # the binary is written to the same line as the next line that contains timestep info - if not timestep_contains_binary: - line = next(f) - - # reset the variables used to check if the timestep contains binary data - add_this_timestep = True + # default is that the timestep does not contain binary timestep_contains_binary = False - if "timestep" in line: - - # used for dealing with binary data in HISTORY file sometimes - # ideally this issue should be fixed in FFLUX - timestep = DlpolyTimestep() - - # record = 'timestep' ntimestep number_of_atoms keytraj keypbc timestep_length timestep - record = line.split() - if "\x00" in record[0]: - del record[0] - timestep.ntimestep = int(record[1]) - timestep.number_of_atoms = int(record[2]) - timestep.trajectory_key = DlpolyTrajectoryKey(int(record[3])) - timestep.periodic_boundary = DlpolyPeriodicBoundary( - int(record[4]) - ) - timestep.timestep_length = float(record[5]) - timestep.timestep = float(record[6]) - - timestep.unit_cell[0, :] = np.array( - [float(ai) for ai in next(f).split()] - ) # a vector of unit cell - timestep.unit_cell[1, :] = np.array( - [float(bi) for bi in next(f).split()] - ) # b vector of unit cell - timestep.unit_cell[2, :] = np.array( - [float(ci) for ci in next(f).split()] - ) # c vector of unit cell - - for _ in range(timestep.number_of_atoms): - - # for some reason binary is being written to the HISTORY file sometimes - # this then causes issues with parsing - # if the line has timestep in it, it means that the geometry is missing + line = next(f) + # if timestep is in beginning of line, this should be beginning of geometry + # and there should not be binary in that line + if "timestep" == line.split()[0]: + + this_timestep_lines = [] + this_timestep_lines.append(line) + for _ in range(nlines_to_read): line = next(f) - if "timestep" in line: - # do not add timestep to the resulting history - # ie history will have 1 less timestep - add_this_timestep = False - # this variable is used in the beginning of the while loop - # setting to True means that the next(f) would not be called + this_timestep_lines.append(line) + + # check if any line has binary + # if it does, then do not add this timestep + # doing so might also remove next timesteps + # because of lines missing that are read from the next geometry + for i in this_timestep_lines: + if "\x00" in i: timestep_contains_binary = True - # add the messed up timestep to a list - removed_timesteps.append(timestep.ntimestep) break - # record = atom_type atom_index atomic_mass charge - record = split_by(line, [8, 10, 12, 12]) + # if there is no binary in timestep, then it should have clean + # data, so we can read it as normal + # iterate over the list containing all tines for this timestep + if not timestep_contains_binary: - timestep_atom_type = str(record[0]) + this_timestep_lines = iter(this_timestep_lines) - timestep_atom_coordinates = np.array( - [float(ci) for ci in next(f).split()] - ) + timestep = DlpolyTimestep() - timestep_atom = DlpolyTimestepAtom( - timestep_atom_type, - timestep_atom_coordinates[0], - timestep_atom_coordinates[1], - timestep_atom_coordinates[2], - ) + # loop over lines in the timestep that has been read in + line = next(this_timestep_lines) + # record = 'timestep' ntimestep number_of_atoms keytraj keypbc timestep_length timestep + record = line.split() + # since this timestep not contain binary + # the timestep that is in the file should be correct + timestep.ntimestep = int(record[1]) - if timestep.trajectory_key in [ - DlpolyTrajectoryKey.CoordinateVelocity, - DlpolyTrajectoryKey.CoordinateVelocityForce, - ]: - timestep_atom.velocity = np.array( - [float(vi) for vi in next(f).split()] + timestep.number_of_atoms = int(record[2]) + timestep.trajectory_key = DlpolyTrajectoryKey( + int(record[3]) + ) + timestep.periodic_boundary = DlpolyPeriodicBoundary( + int(record[4]) + ) + timestep.timestep_length = float(record[5]) + timestep.timestep = float(record[6]) + + timestep.unit_cell[0, :] = np.array( + [float(ai) for ai in next(this_timestep_lines).split()] + ) # a vector of unit cell + timestep.unit_cell[1, :] = np.array( + [float(bi) for bi in next(this_timestep_lines).split()] + ) # b vector of unit cell + timestep.unit_cell[2, :] = np.array( + [float(ci) for ci in next(this_timestep_lines).split()] + ) # c vector of unit cell + + for _ in range(timestep.number_of_atoms): + + line = next(this_timestep_lines) + # record = atom_type atom_index atomic_mass charge + record = split_by(line, [8, 10, 12, 12]) + + timestep_atom_type = str(record[0]) + + line = next(this_timestep_lines) + timestep_atom_coordinates = np.array( + [float(ci) for ci in line.split()] ) - if ( - timestep.trajectory_key - is DlpolyTrajectoryKey.CoordinateVelocityForce - ): - timestep_atom.force = np.array( - [float(fi) for fi in next(f).split()] + timestep_atom = DlpolyTimestepAtom( + timestep_atom_type, + timestep_atom_coordinates[0], + timestep_atom_coordinates[1], + timestep_atom_coordinates[2], ) - timestep.add(timestep_atom) - - if add_this_timestep: + if timestep.trajectory_key in [ + DlpolyTrajectoryKey.CoordinateVelocity, + DlpolyTrajectoryKey.CoordinateVelocityForce, + ]: + timestep_atom.velocity = np.array( + [ + float(vi) + for vi in next(this_timestep_lines).split() + ] + ) + + if ( + timestep.trajectory_key + is DlpolyTrajectoryKey.CoordinateVelocityForce + ): + timestep_atom.force = np.array( + [ + float(fi) + for fi in next(this_timestep_lines).split() + ] + ) + + timestep.add(timestep_atom) + + # this timestep should be safe to add + # and no binary data should be present self.add(timestep) - # if end of file is reached, add the removed timesteps attribute - except StopIteration: - self.removed_timesteps = removed_timesteps + except StopIteration: + # these are the timesteps that are read in + # get the ntimestep attribute + # which should always be correct even if data is missing + existing_timesteps = [i.ntimestep for i in self] + + # these are the missing timesteps because of binary in HISTORY file + removed_timesteps = [] + # loop over all timesteps that are in the HISTORY file + # note that the initial geometry is also counted a timestep + # so setting the CONTROL timesteps to 500 for example will give 501 geometries in HISTORY file + for i in range(self.ntimesteps): + if i not in existing_timesteps: + removed_timesteps.append(i) + + self.existing_timesteps = existing_timesteps + self.removed_timesteps = removed_timesteps + + # TODO: potentially return this implementation once the issues with + # binary code in the HISTORY file is resolved. + + # def _read_file(self): + + # with open(self.path, "r") as f: + + # self.title = next(f) + # record = next(f).split() + # self.trajectory_key = DlpolyTrajectoryKey(int(record[0])) + # self.periodic_boundary = DlpolyTrajectoryKey(int(record[1])) + # self.number_of_atoms = int(record[2]) + # self.ntimesteps = int(record[3]) + + # removed_timesteps = [] + # timestep_contains_binary = False + + # try: + # while True: + + # # sometimes binary data is written in HISTORY file + # # the binary is written to the same line as the next line that contains timestep info + # if not timestep_contains_binary: + # line = next(f) + + # # reset the variables used to check if the timestep contains binary data + # add_this_timestep = True + # timestep_contains_binary = False + + # if "timestep" in line: + + # # used for dealing with binary data in HISTORY file sometimes + # # ideally this issue should be fixed in FFLUX + # timestep = DlpolyTimestep() + + # # record = 'timestep' ntimestep number_of_atoms keytraj keypbc timestep_length timestep + # record = line.split() + # if "\x00" in record[0]: + # del record[0] + # timestep.ntimestep = int(record[1]) + # timestep.number_of_atoms = int(record[2]) + # timestep.trajectory_key = DlpolyTrajectoryKey(int(record[3])) + # timestep.periodic_boundary = DlpolyPeriodicBoundary( + # int(record[4]) + # ) + # timestep.timestep_length = float(record[5]) + # timestep.timestep = float(record[6]) + + # timestep.unit_cell[0, :] = np.array( + # [float(ai) for ai in next(f).split()] + # ) # a vector of unit cell + # timestep.unit_cell[1, :] = np.array( + # [float(bi) for bi in next(f).split()] + # ) # b vector of unit cell + # timestep.unit_cell[2, :] = np.array( + # [float(ci) for ci in next(f).split()] + # ) # c vector of unit cell + + # for _ in range(timestep.number_of_atoms): + + # # for some reason binary is being written to the HISTORY file sometimes + # # this then causes issues with parsing + # # if the line has timestep in it, it means that the geometry is missing + # line = next(f) + # if "timestep" in line: + # # do not add timestep to the resulting history + # # ie history will have 1 less timestep + # add_this_timestep = False + # # this variable is used in the beginning of the while loop + # # setting to True means that the next(f) would not be called + # timestep_contains_binary = True + # # add the messed up timestep to a list + # removed_timesteps.append(timestep.ntimestep) + # break + + # # record = atom_type atom_index atomic_mass charge + # record = split_by(line, [8, 10, 12, 12]) + + # timestep_atom_type = str(record[0]) + + # timestep_atom_coordinates = np.array( + # [float(ci) for ci in next(f).split()] + # ) + + # timestep_atom = DlpolyTimestepAtom( + # timestep_atom_type, + # timestep_atom_coordinates[0], + # timestep_atom_coordinates[1], + # timestep_atom_coordinates[2], + # ) + + # if timestep.trajectory_key in [ + # DlpolyTrajectoryKey.CoordinateVelocity, + # DlpolyTrajectoryKey.CoordinateVelocityForce, + # ]: + # timestep_atom.velocity = np.array( + # [float(vi) for vi in next(f).split()] + # ) + + # if ( + # timestep.trajectory_key + # is DlpolyTrajectoryKey.CoordinateVelocityForce + # ): + # timestep_atom.force = np.array( + # [float(fi) for fi in next(f).split()] + # ) + + # timestep.add(timestep_atom) + + # if add_this_timestep: + # self.add(timestep) + + # # if end of file is reached, add the removed timesteps attribute + # except StopIteration: + # self.removed_timesteps = removed_timesteps @convert_to_path def write_to_trajectory(self, path: str = "TRAJECTORY.xyz"):