diff --git a/.travis-data/test_daemon.py b/.travis-data/test_daemon.py index 8fe4798975..b6d05f4491 100644 --- a/.travis-data/test_daemon.py +++ b/.travis-data/test_daemon.py @@ -41,30 +41,42 @@ def jobs_have_finished(pks): print "{}/{} finished".format(num_finished, len(finished_list)) return not (False in finished_list) +def print_logshow(pk): + print "Output of 'verdi calculation logshow {}':".format(pk) + try: + print subprocess.check_output( + ["verdi", "calculation", "logshow", "{}".format(pk)], + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError as e2: + print "Note: the command failed, message: {}".format(e2.message) + def validate_calculations(expected_results): valid = True - actual_value = None - for pk, expected_value in expected_results.iteritems(): + actual_dict = {} + for pk, expected_dict in expected_results.iteritems(): calc = load_node(pk) + if not calc.has_finished_ok(): + print 'Calculation<{}> status was not FINISHED'.format(pk) + print_logshow(pk) + return False + try: - actual_value = int(calc.out.retrieved.folder.open('path/output.txt').read()) - except (AttributeError, IOError, ValueError) as e: - print "* UNABLE TO RETRIEVE VALUE for calc pk={}: I expected {}, I got {}: {}".format( - pk, expected_value, type(e), e) - - print "Output of 'verdi calculation logshow {}':".format(pk) - try: - print subprocess.check_output( - ["verdi", "calculation", "logshow", "{}".format(pk)], - stderr=subprocess.STDOUT, - ) - except subprocess.CalledProcessError as e2: - print "Note: the command failed, message: {}".format(e2.message) - valid = False + actual_dict = calc.out.output_parameters.get_dict() + except (KeyError, AttributeError) as exception: + print 'Could not retrieve output_parameters node for Calculation<{}>'.format(pk) + print_logshow(pk) + return False - if actual_value != expected_value: + try: + actual_dict['retrieved_temporary_files'] = dict(actual_dict['retrieved_temporary_files']) + except KeyError: + # If the retrieval 
fails we simply pass as the following check of the actual value will fail anyway + pass + + if actual_dict != expected_dict: print "* UNEXPECTED VALUE {} for calc pk={}: I expected {}".format( - actual_value, pk, expected_value) + actual_dict, pk, expected_dict) valid = False return valid @@ -100,18 +112,25 @@ def main(): 'input_file_template': "{value}", # File just contains the value to double 'input_file_name': 'value_to_double.txt', 'output_file_name': 'output.txt', + 'retrieve_temporary_files': ['triple_value.tmp'] }) calc = code.new_calc() calc.set_max_wallclock_seconds(5 * 60) # 5 min calc.set_resources({"num_machines": 1}) calc.set_withmpi(False) + calc.set_parser_name('simpleplugins.templatereplacer.test.doubler') calc.use_parameters(parameters) calc.use_template(template) calc.store_all() print "[{}] created calculation {}, pk={}".format( counter, calc.uuid, calc.dbnode.pk) - expected_results_calculations[calc.pk] = inputval*2 + expected_results_calculations[calc.pk] = { + 'value': inputval * 2, + 'retrieved_temporary_files': { + 'triple_value.tmp': str(inputval * 3) + } + } calc.submit() print "[{}] calculation submitted.".format(counter) diff --git a/.travis-data/torquessh-doubler/doubler.sh b/.travis-data/torquessh-doubler/doubler.sh index cd5b1f8128..a82a219f39 100755 --- a/.travis-data/torquessh-doubler/doubler.sh +++ b/.travis-data/torquessh-doubler/doubler.sh @@ -11,3 +11,4 @@ fi INPUTVALUE=`cat value_to_double.txt` echo $(( $INPUTVALUE * 2 )) +echo $(( $INPUTVALUE * 3 )) > 'triple_value.tmp' diff --git a/aiida/common/datastructures.py b/aiida/common/datastructures.py index f0ebe1035f..ef0ff24050 100644 --- a/aiida/common/datastructures.py +++ b/aiida/common/datastructures.py @@ -19,18 +19,16 @@ class CalcState(Enumerate): 'NEW', # just created 'TOSUBMIT', # used by the executionmanager to submit new calculations scheduled to be submitted 'SUBMITTING', # being submitted to cluster - 'WITHSCHEDULER', # on the scheduler (on any unfinished status: - # 
QUEUED, QUEUED_HELD, SUSPENDED, RUNNING) - 'COMPUTED', # Calculation finished on scheduler, not yet retrieved - # (both DONE and FAILED) + 'WITHSCHEDULER', # on the scheduler (on any unfinished status: QUEUED, QUEUED_HELD, SUSPENDED, RUNNING) + 'COMPUTED', # calculation finished on scheduler, not yet retrieved (both DONE and FAILED) 'RETRIEVING', # while retrieving data 'PARSING', # while parsing data - 'FINISHED', # Final state of the calculation: data retrieved and eventually parsed + 'FINISHED', # final state of the calculation: data retrieved and eventually parsed 'SUBMISSIONFAILED', # error occurred during submission phase 'RETRIEVALFAILED', # error occurred during retrieval phase 'PARSINGFAILED', # error occurred during parsing phase due to a problem in the parse - 'FAILED', # The parser recognized the calculation as failed - 'IMPORTED', # The calculation was imported from another DB + 'FAILED', # the parser recognized the calculation as failed + 'IMPORTED', # the calculation was imported from another DB ) # The order of states is not random: is the order of precedence. @@ -83,14 +81,44 @@ def sort_states(list_states, use_key=False): class CalcInfo(DefaultFieldsAttributeDict): """ This object will store the data returned by the calculation plugin and to be - passed to the ExecManager + passed to the ExecManager. + + In the following descriptions all paths have to be considered relative + + * retrieve_list: a list of strings or tuples that indicate files that are to be retrieved from the remote + after the calculation has finished and stored in the repository in a FolderData. + If the entry in the list is just a string, it is assumed to be the filepath on the remote and it will + be copied to '.' of the repository with name os.path.split(item)[1] + If the entry is a tuple it is expected to have the following format + + ('remotepath', 'localpath', depth) + + If the 'remotepath' is a file or folder, it will be copied in the repository to 'localpath'. 
+ However, if the 'remotepath' contains file patterns with wildcards, the 'localpath' should be set to '.' + and the depth parameter should be an integer that decides the localname. The 'remotepath' will be split on + file separators and the local filename will be determined by joining the N last elements, where N is + given by the depth variable. + + Example: ('some/remote/path/files/pattern*[0-9].xml', '.', 2) + + Will result in all files that match the pattern to be copied to the local repository with path + + 'files/pattern*[0-9].xml' + + * retrieve_temporary_list: a list of strings or tuples that indicate files that will be retrieved + and stored temporarily in a FolderData, that will be available only during the parsing call. + The format of the list is the same as that of 'retrieve_list' + + * retrieve_singlefile_list: a list of tuples with format + ('linkname_from calc to singlefile', 'subclass of singlefile', 'filename') + Each tuple represents a file that will be retrieved from cluster and saved in SinglefileData nodes + + * local_copy_list: a list of tuples with format ('localabspath', 'relativedestpath') + * remote_copy_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath') + * remote_symlink_list: a list of tuples with format ('remotemachinename', 'remoteabspath', 'relativedestpath') + * codes_info: a list of dictionaries used to pass the info of the execution of a code + * codes_run_mode: a string used to specify the order in which multi codes can be executed """ - # Note: some of the variables might have never been used in AiiDA - # one might want to clean all this stuff in a future revision - # Note: probably some of the fields below are not used anymore inside - # calcinfo, but are rather directly set from calculation attributes to - # the JobInfo to be passed to the ExecManager - # (see, for instance, 'queue_name'). _default_fields = ( 'job_environment', # TODO UNDERSTAND THIS! 
@@ -100,40 +128,20 @@ class CalcInfo(DefaultFieldsAttributeDict): 'uuid', 'prepend_text', 'append_text', -# 'cmdline_params', # as a list of strings. These 5 variables are now in CalcInfo -# 'stdin_name', -# 'stdout_name', -# 'stderr_name', -# 'join_files', - # 'queue_name', This is not used in CalcInfo, it is automatically set from - # calculation attributes to JobInfo 'num_machines', 'num_mpiprocs_per_machine', 'priority', 'max_wallclock_seconds', 'max_memory_kb', 'rerunnable', - 'retrieve_list', # a list of files or patterns to retrieve, with two - # possible formats: [ 'remotepath', # just the name of the file to retrieve. Will be put in '.' of the repositorym with name os.path.split(item)[1] - # ['remotepath','localpath',depth] ] - # second format will copy the remotepath file/folder to localpath. - # if remotepath is a file/folder, localpath will be its local name - # if remotepath has file patterns, localpath should only be '.' - # depth is an integer to decide the localname: will be os.path.join(localpath, filename ) - # where filename takes remotepath.split() and joins the last #depth elements - # use the second option if you are using file patterns (*,[0-9],...) - # ALL PATHS ARE RELATIVE! 
- 'local_copy_list', # a list of length-two tuples with (localabspath, relativedestpath) - 'remote_copy_list', # a list of length-three tuples with (remotemachinename, remoteabspath, relativedestpath) + 'retrieve_list', + 'retrieve_temporary_list', + 'retrieve_singlefile_list', + 'local_copy_list', + 'remote_copy_list', 'remote_symlink_list', - # a list of length-three tuples with (remotemachinename, remoteabspath, relativedestpath) - 'retrieve_singlefile_list', # a list of files, that will be retrieved - # from cluster and saved in SinglefileData nodes - # in the following format: - # ["linkname_from calc to singlefile","subclass of singlefile","filename"] - # filename remote = filename local - 'codes_info', # a list of dictionaries used to pass the info of the execution of a code. - 'codes_run_mode', # a string used to specify the order in which multi codes can be executed + 'codes_info', + 'codes_run_mode' ) @@ -148,8 +156,7 @@ class CodeRunmode(Enumerate): # if serial, it will be: # code1.x # code2.x -code_run_modes = CodeRunmode(('PARALLEL', - 'SERIAL')) +code_run_modes = CodeRunmode(('PARALLEL', 'SERIAL')) class CodeInfo(DefaultFieldsAttributeDict): diff --git a/aiida/daemon/execmanager.py b/aiida/daemon/execmanager.py index b695db46e6..58379f1e5d 100644 --- a/aiida/daemon/execmanager.py +++ b/aiida/daemon/execmanager.py @@ -708,6 +708,7 @@ def retrieve_computed_for_authinfo(authinfo): extra=logger_extra) workdir = calc._get_remote_workdir() retrieve_list = calc._get_retrieve_list() + retrieve_temporary_list = calc._get_retrieve_temporary_list() retrieve_singlefile_list = calc._get_retrieve_singlefile_list() execlogger.debug("[retrieval of calc {}] " "chdir {}".format(calc.pk, workdir), @@ -721,55 +722,9 @@ def retrieve_computed_for_authinfo(authinfo): # First, retrieve the files of folderdata with SandboxFolder() as folder: - for item in retrieve_list: - # I have two possibilities: - # * item is a string - # * or is a list - # then I have other two 
possibilities: - # * there are file patterns - # * or not - # First decide the name of the files - if isinstance(item, list): - tmp_rname, tmp_lname, depth = item - # if there are more than one file I do something differently - if t.has_magic(tmp_rname): - remote_names = t.glob(tmp_rname) - local_names = [] - for rem in remote_names: - to_append = rem.split(os.path.sep)[-depth:] if depth > 0 else [] - local_names.append(os.path.sep.join([tmp_lname] + to_append)) - else: - remote_names = [tmp_rname] - to_append = remote_names.split(os.path.sep)[-depth:] if depth > 0 else [] - local_names = [os.path.sep.join([tmp_lname] + to_append)] - if depth > 1: # create directories in the folder, if needed - for this_local_file in local_names: - new_folder = os.path.join( - folder.abspath, - os.path.split(this_local_file)[0]) - if not os.path.exists(new_folder): - os.makedirs(new_folder) - else: # it is a string - if t.has_magic(item): - remote_names = t.glob(item) - local_names = [os.path.split(rem)[1] for rem in remote_names] - else: - remote_names = [item] - local_names = [os.path.split(item)[1]] - - for rem, loc in zip(remote_names, local_names): - execlogger.debug("[retrieval of calc {}] " - "Trying to retrieve remote item '{}'".format( - calc.pk, rem), - extra=logger_extra) - t.get(rem, - os.path.join(folder.abspath, loc), - ignore_nonexisting=True) - - # Here I retrieved everything; - # now I store them inside the calculation - retrieved_files.replace_with_folder(folder.abspath, - overwrite=True) + retrieve_files_from_list(calc, t, folder, retrieve_list) + # Here I retrieved everything; now I store them inside the calculation + retrieved_files.replace_with_folder(folder.abspath, overwrite=True) # Second, retrieve the singlefiles with SandboxFolder() as folder: @@ -777,14 +732,10 @@ def retrieve_computed_for_authinfo(authinfo): for (linkname, subclassname, filename) in retrieve_singlefile_list: execlogger.debug("[retrieval of calc {}] Trying " "to retrieve remote 
singlefile '{}'".format( - calc.pk, filename), - extra=logger_extra) - localfilename = os.path.join( - folder.abspath, os.path.split(filename)[1]) - t.get(filename, localfilename, - ignore_nonexisting=True) - singlefile_list.append((linkname, subclassname, - localfilename)) + calc.pk, filename), extra=logger_extra) + localfilename = os.path.join(folder.abspath, os.path.split(filename)[1]) + t.get(filename, localfilename, ignore_nonexisting=True) + singlefile_list.append((linkname, subclassname, localfilename)) # ignore files that have not been retrieved singlefile_list = [i for i in singlefile_list if @@ -796,38 +747,47 @@ def retrieve_computed_for_authinfo(authinfo): SinglefileSubclass = DataFactory(subclassname) singlefile = SinglefileSubclass() singlefile.set_file(filename) - singlefile.add_link_from(calc, label=linkname, - link_type=LinkType.CREATE) + singlefile.add_link_from(calc, label=linkname, link_type=LinkType.CREATE) singlefiles.append(singlefile) - # Finally, store - execlogger.debug("[retrieval of calc {}] " - "Storing retrieved_files={}".format( - calc.pk, retrieved_files.dbnode.pk), - extra=logger_extra) + # Retrieve the temporary files in a separate temporary folder if any files were + # specified in the 'retrieve_temporary_list' key + if retrieve_temporary_list: + retrieved_temporary_folder = FolderData() + with SandboxFolder() as folder: + retrieve_files_from_list(calc, t, folder, retrieve_temporary_list) + retrieved_temporary_folder.replace_with_folder(folder.abspath, overwrite=True) + + # Log the files that were retrieved in the temporary folder + for entry in retrieved_temporary_folder.get_folder_list(): + execlogger.debug("[retrieval of calc {}] Retrieved temporary file or folder '{}'".format( + calc.pk, entry), extra=logger_extra) + else: + retrieved_temporary_folder = None + + # Finally, store the retrieved_files node. 
The retrieved_temporary_folder node + # is explicitly not stored, but will just be passed to the parser.parse_from calc call + execlogger.debug("[retrieval of calc {}] Storing retrieved_files={}".format( + calc.pk, retrieved_files.dbnode.pk), extra=logger_extra) retrieved_files.store() + for fil in singlefiles: - execlogger.debug("[retrieval of calc {}] " - "Storing retrieved_singlefile={}".format( - calc.pk, fil.dbnode.pk), - extra=logger_extra) + execlogger.debug("[retrieval of calc {}] Storing retrieved_singlefile={}".format( + calc.pk, fil.dbnode.pk), extra=logger_extra) fil.store() - # If I was the one retrieving, I should also be the only - # one parsing! I do not check + # If I was the one retrieving, I should also be the only one parsing! I do not check calc._set_state(calc_states.PARSING) Parser = calc.get_parserclass() # If no parser is set, the calculation is successful successful = True if Parser is not None: - # TODO: parse here parser = Parser(calc) - successful, new_nodes_tuple = parser.parse_from_calc() + successful, new_nodes_tuple = parser.parse_from_calc(retrieved_temporary_folder) for label, n in new_nodes_tuple: - n.add_link_from(calc, label=label, - link_type=LinkType.CREATE) + n.add_link_from(calc, label=label, link_type=LinkType.CREATE) n.store() if successful: @@ -878,3 +838,62 @@ def retrieve_computed_for_authinfo(authinfo): raise return retrieved + +def retrieve_files_from_list(calculation, transport, folder, retrieve_list): + """ + Retrieve all the files in the retrieve_list from the remote into the + local folder instance through the transport. The entries in the retrieve_list + can be of two types: + + * a string + * a list + + If it is a string, it represents the remote absolute filepath of the file. 
+ If the item is a list, the elements will correspond to the following: + + * remotepath + * localpath + * depth + + If the remotepath contains file patterns with wildcards, the localpath will be + treated as the work directory of the folder and the depth integer determines + upto what level of the original remotepath nesting the files will be copied. + + :param transport: the Transport instance + :param folder: a local Folder instance for the transport to store files into + :param retrieve_list: the list of files to retrieve + """ + import os + + for item in retrieve_list: + if isinstance(item, list): + tmp_rname, tmp_lname, depth = item + # if there are more than one file I do something differently + if transport.has_magic(tmp_rname): + remote_names = transport.glob(tmp_rname) + local_names = [] + for rem in remote_names: + to_append = rem.split(os.path.sep)[-depth:] if depth > 0 else [] + local_names.append(os.path.sep.join([tmp_lname] + to_append)) + else: + remote_names = [tmp_rname] + to_append = remote_names.split(os.path.sep)[-depth:] if depth > 0 else [] + local_names = [os.path.sep.join([tmp_lname] + to_append)] + if depth > 1: # create directories in the folder, if needed + for this_local_file in local_names: + new_folder = os.path.join( + folder.abspath, + os.path.split(this_local_file)[0]) + if not os.path.exists(new_folder): + os.makedirs(new_folder) + else: # it is a string + if transport.has_magic(item): + remote_names = transport.glob(item) + local_names = [os.path.split(rem)[1] for rem in remote_names] + else: + remote_names = [item] + local_names = [os.path.split(item)[1]] + + for rem, loc in zip(remote_names, local_names): + transport.logger.debug("[retrieval of calc {}] Trying to retrieve remote item '{}'".format(calculation.pk, rem)) + transport.get(rem, os.path.join(folder.abspath, loc), ignore_nonexisting=True) \ No newline at end of file diff --git a/aiida/orm/calculation/job/simpleplugins/templatereplacer.py 
b/aiida/orm/calculation/job/simpleplugins/templatereplacer.py index be9d32cc8c..f35f02fb7c 100644 --- a/aiida/orm/calculation/job/simpleplugins/templatereplacer.py +++ b/aiida/orm/calculation/job/simpleplugins/templatereplacer.py @@ -7,77 +7,76 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -""" -This is a simple plugin that takes two node inputs, both of type ParameterData, -with the following labels: template and parameters. -You can also add other SinglefileData nodes as input, that will be copied according to -what is written in 'template' (see below). +from aiida.common.exceptions import InputValidationError +from aiida.common.datastructures import CalcInfo, CodeInfo +from aiida.common.utils import classproperty +from aiida.orm.calculation.job import JobCalculation +from aiida.orm.data.parameter import ParameterData -* parameters: a set of parameters that will be used for substitution. -* template: can contain the following parameters: +class TemplatereplacerCalculation(JobCalculation): + """ + Simple stub of a plugin that can be used to replace some text in a given template. + Can be used for many different codes, or as a starting point to develop a new plugin. - * input_file_template: a string with substitutions to be managed with the format()\ - function of python, i.e. if you want to substitute a variable called 'varname', you write\ - {varname} in the text. See http://www.python.org/dev/peps/pep-3101/ for more\ - details. The replaced file will be the input file. + This simple plugin takes two node inputs, both of type ParameterData, with the labels + 'parameters' and 'template' - * input_file_name: a string with the file name for the input. If it is not provided, no\ - file will be created. 
+ You can also add other SinglefileData nodes as input, that will be copied according to + what is written in 'template' (see below). - * output_file_name: a string with the file name for the output. If it is not provided, no\ - redirection will be done and the output will go in the scheduler output file. + * parameters: a set of parameters that will be used for substitution. - * cmdline_params: a list of strings, to be passed as command line parameters.\ - Each one is substituted with the same rule of input_file_template. Optional + * template: can contain the following parameters: - * input_through_stdin: if True, the input file name is passed via stdin. Default is\ - False if missing. + * input_file_template: a string with substitutions to be managed with the format() + function of python, i.e. if you want to substitute a variable called 'varname', you write + {varname} in the text. See http://www.python.org/dev/peps/pep-3101/ for more + details. The replaced file will be the input file. - * files_to_copy: if defined, a list of tuple pairs, with format ('link_name', 'dest_rel_path');\ - for each tuple, an input link to this calculation is looked for, with link labeled 'link_label',\ - and with file type 'Singlefile', and the content is copied to a remote file named 'dest_rel_path'\ - Errors are raised in the input links are non-existent, or of the wrong type, or if there are \ - unused input files. + * input_file_name: a string with the file name for the input. If it is not provided, no + file will be created. -TODO: probably use Python's Template strings instead?? -TODO: catch exceptions -""" -from aiida.orm.calculation.job import JobCalculation -from aiida.common.exceptions import InputValidationError -from aiida.common.datastructures import CalcInfo, CodeInfo -from aiida.common.utils import classproperty -from aiida.orm.data.parameter import ParameterData + * output_file_name: a string with the file name for the output. 
If it is not provided, no + redirection will be done and the output will go in the scheduler output file. -# TODO: write a 'input_type_checker' routine to automatically check the existence -# and type of inputs + default values etc. + * cmdline_params: a list of strings, to be passed as command line parameters. + Each one is substituted with the same rule of input_file_template. Optional + * input_through_stdin: if True, the input file name is passed via stdin. Default is False if missing. + * files_to_copy: if defined, a list of tuple pairs, with format ('link_name', 'dest_rel_path'); + for each tuple, an input link to this calculation is looked for, with link labeled 'link_label', + and with file type 'Singlefile', and the content is copied to a remote file named 'dest_rel_path' + Errors are raised in the input links are non-existent, or of the wrong type, or if there are + unused input files. -class TemplatereplacerCalculation(JobCalculation): - """ - Simple stub of a plugin that can be used to replace some text in a given - template. Can be used for many different codes, or as a starting point - to develop a new plugin. + * retrieve_temporary_files: a list of relative filepaths, that if defined, will be retrieved and + temporarily stored in an unstored FolderData node that will be available during the + Parser.parser_with_retrieved call under the key specified by the Parser.retrieved_temporary_folder key + + TODO: probably use Python's Template strings instead?? + TODO: catch exceptions + TODO: write a 'input_type_checker' routine to automatically check the existence and type of inputs + default values etc. 
""" @classproperty def _use_methods(cls): retdict = JobCalculation._use_methods retdict.update({ - "template": { - 'valid_types': ParameterData, - 'additional_parameter': None, - 'linkname': 'template', - 'docstring': "A template for the input file", - }, - "parameters": { - 'valid_types': ParameterData, - 'additional_parameter': None, - 'linkname': 'parameters', - 'docstring': "Parameters used to replace placeholders in the template", - }, - }) + 'template': { + 'valid_types': ParameterData, + 'additional_parameter': None, + 'linkname': 'template', + 'docstring': 'A template for the input file', + }, + 'parameters': { + 'valid_types': ParameterData, + 'additional_parameter': None, + 'linkname': 'parameters', + 'docstring': 'Parameters used to replace placeholders in the template', + }, + }) return retdict def _prepare_for_submission(self, tempfolder, inputdict): @@ -92,7 +91,6 @@ def _prepare_for_submission(self, tempfolder, inputdict): """ import StringIO - from aiida.orm.data.parameter import ParameterData from aiida.orm.data.singlefile import SinglefileData from aiida.orm.data.remote import RemoteData from aiida.common.utils import validate_list_of_string_tuples @@ -107,16 +105,16 @@ def _prepare_for_submission(self, tempfolder, inputdict): template_node = inputdict.pop('template', None) template = template_node.get_dict() - input_file_template = template.pop('input_file_template', "") + input_file_template = template.pop('input_file_template', '') input_file_name = template.pop('input_file_name', None) output_file_name = template.pop('output_file_name', None) cmdline_params_tmpl = template.pop('cmdline_params', []) input_through_stdin = template.pop('input_through_stdin', False) files_to_copy = template.pop('files_to_copy', []) + retrieve_temporary_files = template.pop('retrieve_temporary_files', []) if template: - raise InputValidationError("The following keys could not be " - "used in the template node: {}".format( + raise InputValidationError('The following 
keys could not be used in the template node: {}'.format( template.keys())) try: @@ -150,8 +148,7 @@ def _prepare_for_submission(self, tempfolder, inputdict): if len(inputdict) > 0: raise InputValidationError("The input nodes with the following labels could not be " - "used by the templatereplacer plugin: {}".format( - inputdict.keys())) + "used by the templatereplacer plugin: {}".format(inputdict.keys())) if input_file_name is not None and not input_file_template: raise InputValidationError("If you give an input_file_name, you " @@ -172,6 +169,7 @@ def _prepare_for_submission(self, tempfolder, inputdict): calcinfo = CalcInfo() calcinfo.retrieve_list = [] + calcinfo.retrieve_temporary_list = [] calcinfo.uuid = self.uuid calcinfo.local_copy_list = local_copy_list @@ -179,11 +177,17 @@ def _prepare_for_submission(self, tempfolder, inputdict): codeinfo = CodeInfo() codeinfo.cmdline_params = cmdline_params + if input_through_stdin is not None: codeinfo.stdin_name = input_file_name + if output_file_name: codeinfo.stdout_name = output_file_name calcinfo.retrieve_list.append(output_file_name) + + if retrieve_temporary_files: + calcinfo.retrieve_temporary_list = retrieve_temporary_files + codeinfo.code_uuid = code.uuid calcinfo.codes_info = [codeinfo] diff --git a/aiida/orm/implementation/general/calculation/job/__init__.py b/aiida/orm/implementation/general/calculation/job/__init__.py index 336e2c9090..93b31b7f35 100644 --- a/aiida/orm/implementation/general/calculation/job/__init__.py +++ b/aiida/orm/implementation/general/calculation/job/__init__.py @@ -675,12 +675,10 @@ def _get_remote_workdir(self): return self.get_attr('remote_workdir', None) def _set_retrieve_list(self, retrieve_list): - if self.get_state() not in (calc_states.SUBMITTING, - calc_states.NEW): + if self.get_state() not in (calc_states.SUBMITTING, calc_states.NEW): raise ModificationNotAllowed( - "Cannot set the retrieve_list for a calculation " - "that is neither NEW nor SUBMITTING (current state is " 
- "{})".format(self.get_state())) + "Cannot set the retrieve_list for a calculation that is neither " + "NEW nor SUBMITTING (current state is {})".format(self.get_state())) # accept format of: [ 'remotename', # ['remotepath','localpath',0] ] @@ -717,16 +715,54 @@ def _get_retrieve_list(self): """ return self.get_attr('retrieve_list', None) + def _set_retrieve_temporary_list(self, retrieve_temporary_list): + """ + Set the list of paths that are to retrieved for parsing and be deleted as soon + as the parsing has been completed. + """ + if self.get_state() not in (calc_states.SUBMITTING, calc_states.NEW): + raise ModificationNotAllowed( + 'Cannot set the retrieve_temporary_list for a calculation that is neither ' + 'NEW nor SUBMITTING (current state is {})'.format(self.get_state())) + + if not (isinstance(retrieve_temporary_list, (tuple, list))): + raise ValueError('You should pass a list/tuple') + + for item in retrieve_temporary_list: + if not isinstance(item, basestring): + if (not (isinstance(item, (tuple, list))) or len(item) != 3): + raise ValueError( + 'You should pass a list containing either ' + 'strings or lists/tuples' + ) + + if (not (isinstance(item[0], basestring)) or + not (isinstance(item[1], basestring)) or + not (isinstance(item[2], int))): + raise ValueError( + 'You have to pass a list (or tuple) of lists, with remotepath(string), ' + 'localpath(string) and depth (integer)' + ) + + self._set_attr('retrieve_temporary_list', retrieve_temporary_list) + + def _get_retrieve_temporary_list(self): + """ + Get the list of files/directories to be retrieved on the cluster and will be kept temporarily during parsing. + Their path is relative to the remote workdirectory path. 
+ + :return: a list of strings for file/directory names + """ + return self.get_attr('retrieve_temporary_list', None) + def _set_retrieve_singlefile_list(self, retrieve_singlefile_list): """ Set the list of information for the retrieval of singlefiles """ - if self.get_state() not in (calc_states.SUBMITTING, - calc_states.NEW): + if self.get_state() not in (calc_states.SUBMITTING, calc_states.NEW): raise ModificationNotAllowed( - "Cannot set the retrieve_singlefile_list for a calculation " - "that is neither NEW nor SUBMITTING (current state is " - "{})".format(self.get_state())) + "Cannot set the retrieve_singlefile_list for a calculation that is neither " + "NEW nor SUBMITTING (current state is {})".format(self.get_state())) if not isinstance(retrieve_singlefile_list, (tuple, list)): raise ValueError("You have to pass a list (or tuple) of lists of " @@ -1457,6 +1493,10 @@ def _presubmit(self, folder, use_unstored_links=False): self.pk, FileSubclass.__name__)) self._set_retrieve_singlefile_list(retrieve_singlefile_list) + # Handle the retrieve_temporary_list + retrieve_temporary_list = (calcinfo.retrieve_temporary_list if calcinfo.retrieve_temporary_list is not None else []) + self._set_retrieve_temporary_list(retrieve_temporary_list) + # the if is done so that if the method returns None, this is # not added. This has two advantages: # - it does not add too many \n\n if most of the prepend_text are empty diff --git a/aiida/parsers/parser.py b/aiida/parsers/parser.py index 0bd1108d1f..a09e4eda51 100644 --- a/aiida/parsers/parser.py +++ b/aiida/parsers/parser.py @@ -13,7 +13,6 @@ """ - class Parser(object): """ Base class for a parser object. @@ -24,16 +23,12 @@ class Parser(object): Get the child Folderdata, parse it and store the parsed data. 
""" _linkname_outparams = 'output_parameters' + _retrieved_temporary_folder_key = 'retrieved_temporary_folder' def __init__(self, calc): - """ - Init - """ from aiida.common import aiidalogger - self._logger = aiidalogger.getChild('parser').getChild( - self.__class__.__name__) - + self._logger = aiidalogger.getChild('parser').getChild( self.__class__.__name__) self._calc = calc @property @@ -45,22 +40,31 @@ def logger(self): import logging from aiida.utils.logger import get_dblogger_extra - return logging.LoggerAdapter(logger=self._logger, - extra=get_dblogger_extra(self._calc)) + return logging.LoggerAdapter(logger=self._logger, extra=get_dblogger_extra(self._calc)) + + @property + def retrieved_temporary_folder_key(self): + """ + Return the key under which the retrieved_temporary_folder will be passed in the + dictionary of retrieved nodes in the parse_with_retrieved method + """ + return self._retrieved_temporary_folder_key def parse_with_retrieved(self, retrieved): """ Receives in input a dictionary of retrieved nodes. Implement all the logic in this function of the subclass. + + :param retrieved: dictionary of retrieved nodes """ raise NotImplementedError - def parse_from_calc(self): + def parse_from_calc(self, retrieved_temporary_folder=None): """ Parses the datafolder, stores results. Main functionality of the class. If you only have one retrieved node, you do not need to reimplement this. 
Implement only the - parse_from_retrieved + parse_with_retrieved """ # select the folder object out_folder = self._calc.get_retrieved_node() @@ -68,8 +72,13 @@ def parse_from_calc(self): self.logger.error("No retrieved folder found") return False, () - return self.parse_with_retrieved( - {self._calc._get_linkname_retrieved(): out_folder}) + retrieved = {self._calc._get_linkname_retrieved(): out_folder} + + if retrieved_temporary_folder is not None: + key = self.retrieved_temporary_folder_key + retrieved[key] = retrieved_temporary_folder + + return self.parse_with_retrieved(retrieved) @classmethod def get_linkname_outparams(self): diff --git a/aiida/parsers/simpleplugins/__init__.py b/aiida/parsers/simpleplugins/__init__.py new file mode 100644 index 0000000000..ef686bb748 --- /dev/null +++ b/aiida/parsers/simpleplugins/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida_core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + + diff --git a/aiida/parsers/simpleplugins/templatereplacer/__init__.py b/aiida/parsers/simpleplugins/templatereplacer/__init__.py new file mode 100644 index 0000000000..ef686bb748 --- /dev/null +++ b/aiida/parsers/simpleplugins/templatereplacer/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida_core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### + + diff --git a/aiida/parsers/simpleplugins/templatereplacer/test.py b/aiida/parsers/simpleplugins/templatereplacer/test.py new file mode 100644 index 0000000000..7ffd8308e8 --- /dev/null +++ b/aiida/parsers/simpleplugins/templatereplacer/test.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +from aiida.orm import CalculationFactory +from aiida.parsers.parser import Parser +from aiida.orm.data.parameter import ParameterData + +TemplatereplacerCalculation = CalculationFactory('simpleplugins.templatereplacer') + +class TemplatereplacerDoublerParser(Parser): + + def __init__(self, calc): + """ + Initialize the Parser for a TemplatereplacerCalculation + + :param calc: instance of the TemplatereplacerCalculation + """ + if not isinstance(calc, TemplatereplacerCalculation): + raise ValueError('Input calculation must be of type {}'.format(type(TemplatereplacerCalculation))) + + super(TemplatereplacerDoublerParser, self).__init__(calc) + + def parse_with_retrieved(self, retrieved): + """ + Parse the output nodes for a TemplatereplacerCalculation from a dictionary of retrieved nodes. + Two nodes that are expected are the default 'retrieved' FolderData node which will + store the retrieved files permanently in the repository. The second required node + is the unstored FolderData node with the temporary retrieved files, which should + be passed under the key given by the 'retrieved_temporary_folder_key' property of the Parser class.
+ + :param retrieved: a dictionary of retrieved nodes + """ + output_nodes = [] + + try: + output_file = self._calc.inp.template.get_dict()['output_file_name'] + except KeyError: + self.logger.error("the output file name 'output_file_name' was not specified in the 'template' input node") + return False, () + + retrieved_folder = retrieved[self._calc._get_linkname_retrieved()] + try: + parsed_value = int(retrieved_folder.get_file_content(output_file).strip()) + except (AttributeError, IOError, ValueError) as e: + self.logger.error("* UNABLE TO RETRIEVE VALUE for calc pk={}: I got {}: {}".format(self._calc.pk, type(e), e)) + return False, () + + output_dict = { + 'value': parsed_value, + 'retrieved_temporary_files': [] + } + + try: + retrieve_temporary_files = self._calc.inp.template.get_dict()['retrieve_temporary_files'] + except KeyError: + retrieve_temporary_files = None + + # If the 'retrieve_temporary_files' key was set in the template input node, we expect a temporary + # FolderData node in the 'retrieved' arguments + if retrieve_temporary_files is not None: + try: + temporary_folder = retrieved[self.retrieved_temporary_folder_key] + except KeyError: + self.logger.error('the {} was not passed as an argument'.format(self.retrieved_temporary_folder_key)) + return False, () + + for retrieved_file in retrieve_temporary_files: + if retrieved_file not in temporary_folder.get_folder_list(): + self.logger.error('the file {} was not found in the temporary retrieved folder'.format(retrieved_file)) + return False, () + + # We always strip the content of the file from whitespace to simplify testing for expected output + output_dict['retrieved_temporary_files'].append((retrieved_file, temporary_folder.get_file_content(retrieved_file).strip())) + + output_parameters = ParameterData(dict=output_dict) + output_nodes.append((self.get_linkname_outparams(), output_parameters)) + + return True, output_nodes \ No newline at end of file diff --git 
a/docs/source/developer_guide/devel_tutorial/code_plugin_int_sum.rst b/docs/source/developer_guide/devel_tutorial/code_plugin_int_sum.rst index a11308f6b4..d6051785fb 100644 --- a/docs/source/developer_guide/devel_tutorial/code_plugin_int_sum.rst +++ b/docs/source/developer_guide/devel_tutorial/code_plugin_int_sum.rst @@ -163,6 +163,7 @@ summation code (a detailed description of the different sections follows):: calcinfo.local_copy_list = [] calcinfo.remote_copy_list = [] calcinfo.retrieve_list = [self._DEFAULT_OUTPUT_FILE] + calcinfo.retrieve_temporary_list = [['path/hugefiles*[0-9].xml', '.', 1]] codeinfo = CodeInfo() codeinfo.cmdline_params = [self._DEFAULT_INPUT_FILE,self._DEFAULT_OUTPUT_FILE] @@ -276,6 +277,29 @@ into the AiiDA database:: calcinfo.retrieve_list = [self._DEFAULT_OUTPUT_FILE] +The entries of the list should either be a string, which corresponds to the full +filepath of the file on the remote, or if you want to specify a group of files with +wildcards, it should be another list containing the following three items + +* Remote path with wildcards e.g. ``some/path/bigfiles*[0-9].xml`` +* Local path, which should always be ``'.'`` in this case of using wildcards +* Depth, which is an integer that indicates to what level the nested subtree structure should be kept. + For example, in this case, with a depth of ``0``, the matched files will be copied to the + root directory as ``bigfiles*[0-9].xml``. For ``depth=1``, the sub path ``path`` will be included + and the files will be copied as ``path/bigfiles*[0-9].xml`` + +There is another field that follows exactly the same syntax as the ``retrieve_list`` but behaves a little differently:: + + calcinfo.retrieve_temporary_list = [['some/path/bigfiles*[0-9].xml', '.', 0]] + +The difference is that these files will be retrieved and stored in a temporary folder, that will only +be available during the parsing of the calculation.
After the parsing is completed, successfully or not, the +files will be deleted. This is useful if, during parsing, one wants to analyze the contents of big files and +parse a small subset of the data to keep permanently, but does not want to store the raw files themselves +which would unnecessarily increase the size of the repository. The files that are retrieved will be stored in +a temporary ``FolderData`` and be passed as an argument to the ``parse_with_retrieved`` method of the ``Parser`` +class, which is implemented by the specific plugin. It will be passed under the key ``retrieved_temporary_folder``. + For the time being, just define also the following variables as empty lists (we will describe them in the next sections):: diff --git a/docs/source/developer_guide/devel_tutorial/code_plugin_qe.rst b/docs/source/developer_guide/devel_tutorial/code_plugin_qe.rst index 5c6cc989db..03df564af2 100644 --- a/docs/source/developer_guide/devel_tutorial/code_plugin_qe.rst +++ b/docs/source/developer_guide/devel_tutorial/code_plugin_qe.rst @@ -300,6 +300,8 @@ How does the method ``_prepare_for_submission`` work in practice? ### Modify here ! calcinfo.retrieve_list.append('Every file/folder you want to store back locally') ### Modify here! + calcinfo.retrieve_temporary_list = [] + ### Modify here! calcinfo.retrieve_singlefile_list = [] ### Modify here and put a name for standard input/output files @@ -316,44 +318,52 @@ How does the method ``_prepare_for_submission`` work in practice? There are a couple of things to be set on calcinfo. - 1. ``retrieve_list``: a list of relative file pathnames, that will be copied + 1. ``local_copy_list``: a list of length-two-tuples: ``('localabspath', + 'relativedestpath')``. Files to be copied from the aiida server to the cluster. + + 2. ``remote_copy_list``: a list of tuples: ``('remotemachinename', 'remoteabspath', + 'relativedestpath')``.
Files/folders to be copied from a remote source to a + remote destination, sitting both on the same machine. + + 3. ``retrieve_list``: a list of relative file pathnames, that will be copied from the cluster to the aiida server, after the calculation has run on cluster. Note that all the file names you need to modify are not absolute path names (you don't know the name of the folder where it will be created) but rather the path relative to the scratch folder. - 2. ``local_copy_list``: a list of length-two-tuples: (localabspath, - relativedestpath). Files to be copied from the aiida server to the cluster. - - 3. ``remote_copy_list``: a list of tuples: (remotemachinename, remoteabspath, - relativedestpath). Files/folders to be copied from a remote source to a - remote destination, sitting both on the same machine. - - 4. ``retrieve_singlefile_list``: a list of triplets, in the form - ``["linkname_from calc to singlefile","subclass of - singlefile","filename"]``. If this is specified, at the end of the - calculation it will be created a SinglefileData-like object in the + 4. ``retrieve_temporary_list``: a list of relative file pathnames, that will be copied + from the cluster to the aiida server, after the calculation has run on the cluster, + that will only be available during the parsing. After parsing has completed, + be it successfully or not, the copied files will be lost. This is useful if one needs + to retrieve big files that are required for the parsing, but that one does not want to + store permanently in the repository. The input format follows the exact same rules as + that of the ``retrieve_list``. + + 5. ``retrieve_singlefile_list``: a list of triplets, in the form + ``['linkname_from calc to singlefile', 'subclass of + singlefile', 'filename']``. If this is specified, at the end of the + calculation it will be created a ``SinglefileData``-like object in the Database, children of the calculation, if of course the file is found on the cluster. - 5. 
codes_info: a list of informations that needs to be passed on the command - line to the code, passed in the form of a list of CalcInfo objects (see later). + 6. codes_info: a list of informations that needs to be passed on the command + line to the code, passed in the form of a list of ``CalcInfo`` objects (see later). Every element in this list corresponds to a call to a code that will be executed in the *same* scheduling job. This can be useful if a code needs to execute a short preprocessing. For long preprocessings, consider to develop a separate plugin. - 6. ``codes_run_mode``: a string, only necessary if you want to run more than one code + 7. ``codes_run_mode``: a string, only necessary if you want to run more than one code in the same scheduling job. Determines the order in which the multiple codes are run (i.e. sequentially or all at the same time. - It assumes one of the values of aiida.common.datastructures.code_run_modes, - like code_run_modes.PARALLEL or code_run_modes.SERIAL + It assumes one of the values of ``aiida.common.datastructures.code_run_modes``, + like ``code_run_modes.PARALLEL`` or ``code_run_modes.SERIAL`` - A CodeInfo object, as said before, describes how a code has to be executed. - The list of CodeInfo objects passed to calcinfo will determined the ordered + A ``CodeInfo`` object, as said before, describes how a code has to be executed. + The list of ``CodeInfo`` objects passed to ``CalcInfo`` will determined the ordered execution of one (or more) calls to executables. - The attributes that can be set to CodeInfo are: + The attributes that can be set to ``CodeInfo`` are: 1. ``stdin_name``: the name of the standard input. 
diff --git a/setup.py b/setup.py index 59c852d8a1..a65ee06df5 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,9 @@ 'upf = aiida.orm.data.upf:UpfData' ], 'aiida.cmdline': [], - 'aiida.parsers': [], + 'aiida.parsers': [ + 'simpleplugins.templatereplacer.test.doubler = aiida.parsers.simpleplugins.templatereplacer.test:TemplatereplacerDoublerParser', + ], 'aiida.schedulers': [ 'direct = aiida.scheduler.plugins.direct:DirectScheduler', 'slurm = aiida.scheduler.plugins.slurm:SlurmScheduler',