From 94c1148b680fccdf9406fc5162f9e55cd74d58ad Mon Sep 17 00:00:00 2001
From: David Emms
Date: Thu, 7 May 2020 14:30:32 +0100
Subject: [PATCH] Check early if open file limit is too low, resolves #384

---
 scripts_of/__main__.py              |  62 +++++++++++++--
 scripts_of/files.py                 |   2 +-
 scripts_of/parallel_task_manager.py |   2 +-
 scripts_of/trees2ologs_of.py        | 119 +++++++++++++++-------------
 scripts_of/util.py                  |  32 +++++++-
 5 files changed, 150 insertions(+), 67 deletions(-)

diff --git a/scripts_of/__main__.py b/scripts_of/__main__.py
index a5fa1ce6..7aef52da 100755
--- a/scripts_of/__main__.py
+++ b/scripts_of/__main__.py
@@ -1604,8 +1604,16 @@ def ProcessesNewFasta(fastaDir, speciesInfoObj_prev = None, speciesToUse_prev_na
     speciesInfoObj.speciesToUse = speciesInfoObj.speciesToUse + newSpeciesIDs
     speciesInfoObj.nSpAll = max(speciesInfoObj.speciesToUse) + 1      # will be one of the new species
     return speciesInfoObj
-
-def CheckOptions(options):
+
+def DeleteDirectoryTree(d):
+    if os.path.exists(d):
+        try:
+            shutil.rmtree(d)
+        except OSError:
+            time.sleep(1)
+            shutil.rmtree(d, True)
+
+def CheckOptions(options, speciesToUse):
     """Check any optional arguments are valid once we know what species are in the analysis
     - user supplied species tree
     """
@@ -1619,7 +1627,45 @@ def CheckOptions(options):
     if options.qStopAfterAlignments and (not options.qMSATrees):
         print("ERROR: Must use '-M msa' option to generate sequence files and infer multiple sequence alignments for orthogroups")
         util.Fail()
-
+
+    # check can open enough files
+    n_extra = 50
+    q_do_orthologs = not any((options.qStopAfterPrepare, options.qStopAfterGroups, options.qStopAfterSeqs, options.qStopAfterAlignments, options.qStopAfterTrees))
+    if q_do_orthologs and not options.qStartFromTrees:
+        n_sp = len(speciesToUse)
+        wd = files.FileHandler.GetWorkingDirectory_Write()
+        wd_files_test = wd + "Files_test/"
+        fh = []
+        try:
+            if not os.path.exists(wd_files_test):
+                os.mkdir(wd_files_test)
+            for i_sp in range(n_sp):
+                di = wd_files_test + "Sp%d/" % i_sp
+                if not os.path.exists(di):
+                    os.mkdir(di)
+                for j_sp in range(n_sp):
+                    fnij = di + "Sp%d.txt" % j_sp
+                    fh.append(open(fnij, 'w'))
+            # create a few extra files to be safe
+            for i_extra in range(n_extra):
+                fh.append(open(wd_files_test + "Extra%d.txt" % i_extra, 'w'))
+            # close the files again and delete
+            for fhh in fh:
+                fhh.close()
+            DeleteDirectoryTree(wd_files_test)
+        except IOError as e:
+            if str(e).startswith("[Errno 24] Too many open files"):
+                util.number_open_files_exception_advice(len(speciesToUse), False)
+                for fhh in fh:
+                    fhh.close()
+                DeleteDirectoryTree(wd_files_test)
+                util.Fail()
+            else:
+                for fhh in fh:
+                    fhh.close()
+                DeleteDirectoryTree(wd_files_test)
+                print("ERROR: Attempted to open required files for OrthoFinder run but an unexpected error occurred. \n\nStacktrace:")
+                raise
     return options

 def main(args=None):
@@ -1646,7 +1692,7 @@ def main(args=None):
                 # 3.
                 speciesInfoObj = ProcessesNewFasta(fastaDir, speciesInfoObj, speciesToUse_names)
                 files.FileHandler.LogSpecies()
-                options = CheckOptions(options)
+                options = CheckOptions(options, speciesInfoObj.speciesToUse)
                 # 4.
                 seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
                 # 5.
@@ -1667,7 +1713,7 @@ def main(args=None):
             speciesInfoObj = None
             speciesInfoObj = ProcessesNewFasta(fastaDir)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 4
             seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
             # 5.
@@ -1688,7 +1734,7 @@ def main(args=None):
             speciesInfoObj, _ = ProcessPreviousFiles(files.FileHandler.GetWorkingDirectory1_Read(), options.qDoubleBlast)
             files.FileHandler.LogSpecies()
             print("Using previously calculated BLAST results in %s" % (files.FileHandler.GetWorkingDirectory1_Read()[0]))
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 4.
             seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
             # 5.
@@ -1703,13 +1749,13 @@ def main(args=None):
             # 0.
             speciesInfoObj, _ = ProcessPreviousFiles(continuationDir, options.qDoubleBlast)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 9
             GetOrthologues(speciesInfoObj, options, prog_caller)
         elif options.qStartFromTrees:
             speciesInfoObj, _ = ProcessPreviousFiles(files.FileHandler.GetWorkingDirectory1_Read(), options.qDoubleBlast)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             GetOrthologues_FromTrees(options)
         else:
             raise NotImplementedError
diff --git a/scripts_of/files.py b/scripts_of/files.py
index bb80765d..916eccc7 100644
--- a/scripts_of/files.py
+++ b/scripts_of/files.py
@@ -795,7 +795,7 @@ def InitialiseFileHandler(options, fastaDir=None, continuationDir=None, resultsD

     Implementation
     1. Working out if an old directory structure is being used
-    2. Construct and apporpriate PreviousFilesLocator if necessary - this locates all required files
+    2. Construct an appropriate PreviousFilesLocator if necessary - this locates all required files
     3. Pass this to FileHandler - this creates the directory structure required for this run
     4. if error: print and exit
     5. Return FileHandler
diff --git a/scripts_of/parallel_task_manager.py b/scripts_of/parallel_task_manager.py
index c3be28a3..0019bdb5 100644
--- a/scripts_of/parallel_task_manager.py
+++ b/scripts_of/parallel_task_manager.py
@@ -355,7 +355,7 @@ def Fail():
     sys.stderr.flush()
     ptm = ParallelTaskManager_singleton()
     ptm.Stop()
-    print("ERROR: An error occurred, please review error messages for more information.")
+    print("ERROR: An error occurred, please review the error messages - they may contain useful information about the problem.")
     sys.exit(1)


diff --git a/scripts_of/trees2ologs_of.py b/scripts_of/trees2ologs_of.py
index c97b5513..629cecd7 100644
--- a/scripts_of/trees2ologs_of.py
+++ b/scripts_of/trees2ologs_of.py
@@ -784,62 +784,69 @@ def __exit__(self, type, value, traceback):

 def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_labelled, GeneToSpecies, all_stride_dup_genes, qNoRecon, hog_writer):
     """ """
-    # Create directory structure
-    speciesDict = ogSet.SpeciesDict()
-    SequenceDict = ogSet.SequenceDict()
-    # Write directory and file structure
-    qInitialisedSuspectGenesDirs = False
-    speciesIDs = ogSet.speciesToUse
-    nspecies = len(speciesIDs)
-    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
-    for index1 in xrange(nspecies):
-        d = dResultsOrthologues + "Orthologues_" + speciesDict[str(speciesIDs[index1])] + "/"
-        if not os.path.exists(d): os.mkdir(d)
-        for index2 in xrange(nspecies):
-            if index2 == index1: continue
-            with open(d + '%s__v__%s.tsv' % (speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]), csv_write_mode) as outfile:
-                writer1 = csv.writer(outfile, delimiter="\t")
-                writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]))
-    neighbours = GetSpeciesNeighbours(species_tree_rooted_labelled)
-    # Infer orthologues and write them to file
-    nOgs = len(ogSet.OGs())
-    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
-    species = list(speciesDict.keys())
-    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
-    spec_seq_dict = ogSet.Spec_SeqDict()
-    sp_to_index = {str(sp):i for i, sp in enumerate(ogSet.speciesToUse)}
-    with open(files.FileHandler.GetDuplicationsFN(), csv_write_mode) as outfile, OrthologsFiles(dResultsOrthologues, speciesDict, ogSet.speciesToUse, nspecies, sp_to_index) as (ortholog_file_writers, suspect_genes_file_writers):
-        dupWriter = csv.writer(outfile, delimiter="\t")
-        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
-        for iog in range(nOgs):
-            rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted_labelled, GeneToSpecies)     # this can be parallelised easily
-            if rooted_tree_ids is None: continue
-            # Write rooted tree with accessions
-            util.RenameTreeTaxa(rooted_tree_ids, files.FileHandler.GetOGsTreeFN(iog, True), spec_seq_dict, qSupport=qHaveSupport, qFixNegatives=True, qViaCopy=True)
-            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog, rooted_tree_ids, species_tree_rooted_labelled, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes, qNoRecon=qNoRecon)
-            GetHOGs_from_tree(iog, recon_tree, hog_writer)
-            qContainsSuspectGenes = len(suspect_genes) > 0
-            if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
-                qInitialisedSuspectGenesDirs = True
-                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
-                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
-                for index1 in xrange(nspecies):
-                    with open(dSuspectOrthologues + '%s.tsv' % speciesDict[str(speciesIDs[index1])], csv_write_mode) as outfile:
-                        writer1 = csv.writer(outfile, delimiter="\t")
-                        writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], "Other"))
-            for index0 in xrange(nspecies):
-                strsp0 = species[index0]
-                strsp0_ = strsp0+"_"
-                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
-                if len(these_genes) > 0:
-                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'a') as outfile:
-                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
-            allOrthologues = [(iog, orthologues)]
-            # don't relabel nodes, they've already been done
-            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict, qSupport=False, qFixNegatives=True)
-            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
-                util.PrintTime("Done %d of %d" % (iog, nOgs))
-            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, dResultsOrthologues, ortholog_file_writers, suspect_genes_file_writers, qContainsSuspectGenes)
+    try:
+        # Create directory structure
+        speciesDict = ogSet.SpeciesDict()
+        SequenceDict = ogSet.SequenceDict()
+        # Write directory and file structure
+        qInitialisedSuspectGenesDirs = False
+        speciesIDs = ogSet.speciesToUse
+        nspecies = len(speciesIDs)
+        dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
+        for index1 in xrange(nspecies):
+            d = dResultsOrthologues + "Orthologues_" + speciesDict[str(speciesIDs[index1])] + "/"
+            if not os.path.exists(d): os.mkdir(d)
+            for index2 in xrange(nspecies):
+                if index2 == index1: continue
+                with open(d + '%s__v__%s.tsv' % (speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]), csv_write_mode) as outfile:
+                    writer1 = csv.writer(outfile, delimiter="\t")
+                    writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]))
+        neighbours = GetSpeciesNeighbours(species_tree_rooted_labelled)
+        # Infer orthologues and write them to file
+        nOgs = len(ogSet.OGs())
+        nOrthologues_SpPair = util.nOrtho_sp(nspecies)
+        species = list(speciesDict.keys())
+        reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
+        spec_seq_dict = ogSet.Spec_SeqDict()
+        sp_to_index = {str(sp):i for i, sp in enumerate(ogSet.speciesToUse)}
+        with open(files.FileHandler.GetDuplicationsFN(), csv_write_mode) as outfile, OrthologsFiles(dResultsOrthologues, speciesDict, ogSet.speciesToUse, nspecies, sp_to_index) as (ortholog_file_writers, suspect_genes_file_writers):
+            dupWriter = csv.writer(outfile, delimiter="\t")
+            dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
+            for iog in range(nOgs):
+                rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted_labelled, GeneToSpecies)     # this can be parallelised easily
+                if rooted_tree_ids is None: continue
+                # Write rooted tree with accessions
+                util.RenameTreeTaxa(rooted_tree_ids, files.FileHandler.GetOGsTreeFN(iog, True), spec_seq_dict, qSupport=qHaveSupport, qFixNegatives=True, qViaCopy=True)
+                orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog, rooted_tree_ids, species_tree_rooted_labelled, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes, qNoRecon=qNoRecon)
+                GetHOGs_from_tree(iog, recon_tree, hog_writer)
+                qContainsSuspectGenes = len(suspect_genes) > 0
+                if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
+                    qInitialisedSuspectGenesDirs = True
+                    dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
+                    dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
+                    for index1 in xrange(nspecies):
+                        with open(dSuspectOrthologues + '%s.tsv' % speciesDict[str(speciesIDs[index1])], csv_write_mode) as outfile:
+                            writer1 = csv.writer(outfile, delimiter="\t")
+                            writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], "Other"))
+                for index0 in xrange(nspecies):
+                    strsp0 = species[index0]
+                    strsp0_ = strsp0+"_"
+                    these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
+                    if len(these_genes) > 0:
+                        with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'a') as outfile:
+                            outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
+                allOrthologues = [(iog, orthologues)]
+                # don't relabel nodes, they've already been done
+                util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict, qSupport=False, qFixNegatives=True)
+                if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
+                    util.PrintTime("Done %d of %d" % (iog, nOgs))
+                nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, dResultsOrthologues, ortholog_file_writers, suspect_genes_file_writers, qContainsSuspectGenes)
+    except IOError as e:
+        if str(e).startswith("[Errno 24] Too many open files"):
+            util.number_open_files_exception_advice(len(ogSet.speciesToUse), True)
+            util.Fail()
+        else:
+            raise
     return nOrthologues_SpPair


diff --git a/scripts_of/util.py b/scripts_of/util.py
index e1e5c978..79e692b6 100644
--- a/scripts_of/util.py
+++ b/scripts_of/util.py
@@ -378,7 +378,37 @@ def FlowText(text, n=60):
             lines += text
             text = ""
     return lines
-
+
+def number_open_files_exception_advice(n_species, q_at_trees):
+    """
+    Prints advice for the user on the "IOError: [Errno 24] Too many open files" exception
+    Args:
+        n_species - the number of species in the analysis
+        q_at_trees - has this error occurred at the orthologs-from-trees stage
+    """
+    # parallel_task_manager.RunCommand("ulimit -Hn")
+    n_req = n_species*n_species + 100
+    msg="\nERROR: The system limit on the number of files a process can open is too low. For %d species \
+OrthoFinder needs to be able to open at least r=%d files. Please increase the limit and restart OrthoFinder:\n\
+1. Check the hard and soft limits on the number of open files for your system:\n\
+    $ ulimit -Hn\n\
+    $ ulimit -Sn\n\
+2. If the hard limit h > r already, then you just need to increase the soft limit:\n\
+    $ ulimit -n %d\n\
+3. Alternatively, if h < r then you need to edit the file '/etc/security/limits.conf'; this requires root privileges. \
+To increase the limit to %d for a user called 'emms', add the lines:\n\
+    emms hard nofile %d\n\
+    emms soft nofile %d\n" % (n_species, n_req, n_req, n_req, n_req, n_req)
+    msg +=" (edit these lines to match your username)\n\
+4. Check the limit has now been updated (if you changed the hard limit you'll need to open a new session and confirm it's updated):\n\
+    $ ulimit -Sn"
+
+    if q_at_trees:
+        msg_part_2 = "5. Once the limit is updated, restart OrthoFinder 'from trees' using the '-ft' command"
+    else:
+        msg_part_2 = "5. Once the limit is updated, restart OrthoFinder with the original command"
+    msg_part_3 = "\nFor full details see: https://github.com/davidemms/OrthoFinder/issues/384"
+    print(msg + "\n" + msg_part_2 + "\n" + msg_part_3 + "\n")
 
 """
 -------------------------------------------------------------------------------
 """
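
Note (not part of the patch): the soft and hard limits that the advice message tells the user to inspect with 'ulimit -Sn' / 'ulimit -Hn' can also be queried from Python itself via the standard-library resource module (Unix only). The sketch below is only an illustration of that idea, assuming the same n_species*n_species + 100 requirement used in number_open_files_exception_advice; the helper name check_open_file_limit is hypothetical and is not an OrthoFinder function.

    # Sketch only (Unix): query RLIMIT_NOFILE instead of opening test files on disk.
    import resource

    def check_open_file_limit(n_species, n_buffer=100):
        """Return (ok, soft, hard) for the open-file limit needed for n_species species."""
        n_req = n_species * n_species + n_buffer      # same requirement as the advice message
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft >= n_req:
            return True, soft, hard
        if hard == resource.RLIM_INFINITY or hard >= n_req:
            # The soft limit can be raised up to the hard limit without root privileges
            resource.setrlimit(resource.RLIMIT_NOFILE, (n_req, hard))
            return True, n_req, hard
        return False, soft, hard                      # the hard limit itself must be raised (steps 3-4 above)

    if __name__ == "__main__":
        ok, soft, hard = check_open_file_limit(n_species=12)
        print(("OK" if ok else "Too low") + ": soft=%d, hard=%d" % (soft, hard))

The patch instead opens real test files in the working directory, which also verifies that the directory layout and filesystem behave as expected; the resource-based check would only test the limit itself.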