From 94c1148b680fccdf9406fc5162f9e55cd74d58ad Mon Sep 17 00:00:00 2001
From: David Emms
Date: Thu, 7 May 2020 14:30:32 +0100
Subject: [PATCH] Check early if open file limit is too low, resolves #384

---
 scripts_of/__main__.py              |  62 +++++++++++++--
 scripts_of/files.py                 |   2 +-
 scripts_of/parallel_task_manager.py |   2 +-
 scripts_of/trees2ologs_of.py        | 119 +++++++++++++++-------------
 scripts_of/util.py                  |  32 +++++++-
 5 files changed, 150 insertions(+), 67 deletions(-)

diff --git a/scripts_of/__main__.py b/scripts_of/__main__.py
index a5fa1ce6..7aef52da 100755
--- a/scripts_of/__main__.py
+++ b/scripts_of/__main__.py
@@ -1604,8 +1604,16 @@ def ProcessesNewFasta(fastaDir, speciesInfoObj_prev = None, speciesToUse_prev_na
     speciesInfoObj.speciesToUse = speciesInfoObj.speciesToUse + newSpeciesIDs
     speciesInfoObj.nSpAll = max(speciesInfoObj.speciesToUse) + 1      # will be one of the new species
     return speciesInfoObj
-
-def CheckOptions(options):
+
+def DeleteDirectoryTree(d):
+    if os.path.exists(d):
+        try:
+            shutil.rmtree(d)
+        except OSError:
+            time.sleep(1)
+            shutil.rmtree(d, True)
+
+def CheckOptions(options, speciesToUse):
     """Check any optional arguments are valid once we know what species are in the analysis
     - user supplied species tree
     """
@@ -1619,7 +1627,45 @@ def CheckOptions(options):
     if options.qStopAfterAlignments and (not options.qMSATrees):
         print("ERROR: Must use '-M msa' option to generate sequence files and infer multiple sequence alignments for orthogroups")
         util.Fail()
-
+
+    # check can open enough files
+    n_extra = 50
+    q_do_orthologs = not any((options.qStopAfterPrepare, options.qStopAfterGroups, options.qStopAfterSeqs, options.qStopAfterAlignments, options.qStopAfterTrees))
+    if q_do_orthologs and not options.qStartFromTrees:
+        n_sp = len(speciesToUse)
+        wd = files.FileHandler.GetWorkingDirectory_Write()
+        wd_files_test = wd + "Files_test/"
+        fh = []
+        try:
+            if not os.path.exists(wd_files_test):
+                os.mkdir(wd_files_test)
+            for i_sp in range(n_sp):
+                di = wd_files_test + "Sp%d/" % i_sp
+                if not os.path.exists(di):
+                    os.mkdir(di)
+                for j_sp in range(n_sp):
+                    fnij = di + "Sp%d.txt" % j_sp
+                    fh.append(open(fnij, 'w'))
+            # create a few extra files to be safe
+            for i_extra in range(n_extra):
+                fh.append(open(wd_files_test + "Extra%d.txt" % i_extra, 'w'))
+            # close the files again and delete
+            for fhh in fh:
+                fhh.close()
+            DeleteDirectoryTree(wd_files_test)
+        except IOError as e:
+            if str(e).startswith("[Errno 24] Too many open files"):
+                util.number_open_files_exception_advice(len(speciesToUse), False)
+                for fhh in fh:
+                    fhh.close()
+                DeleteDirectoryTree(wd_files_test)
+                util.Fail()
+            else:
+                for fhh in fh:
+                    fhh.close()
+                DeleteDirectoryTree(wd_files_test)
+                print("ERROR: Attempted to open required files for OrthoFinder run but an unexpected error occurred. \n\nStacktrace:")
+                raise
     return options

 def main(args=None):
@@ -1646,7 +1692,7 @@ def main(args=None):
                 # 3.
                 speciesInfoObj = ProcessesNewFasta(fastaDir, speciesInfoObj, speciesToUse_names)
                 files.FileHandler.LogSpecies()
-                options = CheckOptions(options)
+                options = CheckOptions(options, speciesInfoObj.speciesToUse)
                 # 4.
                 seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
                 # 5.
@@ -1667,7 +1713,7 @@ def main(args=None):
             speciesInfoObj = None
             speciesInfoObj = ProcessesNewFasta(fastaDir)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 4
             seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
             # 5.
@@ -1688,7 +1734,7 @@ def main(args=None):
             speciesInfoObj, _ = ProcessPreviousFiles(files.FileHandler.GetWorkingDirectory1_Read(), options.qDoubleBlast)
             files.FileHandler.LogSpecies()
             print("Using previously calculated BLAST results in %s" % (files.FileHandler.GetWorkingDirectory1_Read()[0]))
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 4.
             seqsInfo = util.GetSeqsInfo(files.FileHandler.GetWorkingDirectory1_Read(), speciesInfoObj.speciesToUse, speciesInfoObj.nSpAll)
             # 5.
@@ -1703,13 +1749,13 @@ def main(args=None):
             # 0.
             speciesInfoObj, _ = ProcessPreviousFiles(continuationDir, options.qDoubleBlast)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             # 9
             GetOrthologues(speciesInfoObj, options, prog_caller)
         elif options.qStartFromTrees:
             speciesInfoObj, _ = ProcessPreviousFiles(files.FileHandler.GetWorkingDirectory1_Read(), options.qDoubleBlast)
             files.FileHandler.LogSpecies()
-            options = CheckOptions(options)
+            options = CheckOptions(options, speciesInfoObj.speciesToUse)
             GetOrthologues_FromTrees(options)
         else:
             raise NotImplementedError
diff --git a/scripts_of/files.py b/scripts_of/files.py
index bb80765d..916eccc7 100644
--- a/scripts_of/files.py
+++ b/scripts_of/files.py
@@ -795,7 +795,7 @@ def InitialiseFileHandler(options, fastaDir=None, continuationDir=None, resultsD

     Implementation
     1. Working out if an old directory structure is being used
-    2. Construct and apporpriate PreviousFilesLocator if necessary - this locates all required files
+    2. Construct an appropriate PreviousFilesLocator if necessary - this locates all required files
     3. Pass this to FileHandler - this creates the directory structure required for this run
     4. if error: print and exit
     5. Return FileHandler
diff --git a/scripts_of/parallel_task_manager.py b/scripts_of/parallel_task_manager.py
index c3be28a3..0019bdb5 100644
--- a/scripts_of/parallel_task_manager.py
+++ b/scripts_of/parallel_task_manager.py
@@ -355,7 +355,7 @@ def Fail():
     sys.stderr.flush()
     ptm = ParallelTaskManager_singleton()
     ptm.Stop()
-    print("ERROR: An error occurred, please review error messages for more information.")
+    print("ERROR: An error occurred, please review the error messages - they may contain useful information about the problem.")
     sys.exit(1)


diff --git a/scripts_of/trees2ologs_of.py b/scripts_of/trees2ologs_of.py
index c97b5513..629cecd7 100644
--- a/scripts_of/trees2ologs_of.py
+++ b/scripts_of/trees2ologs_of.py
@@ -784,62 +784,69 @@ def __exit__(self, type, value, traceback):

 def DoOrthologuesForOrthoFinder(ogSet, species_tree_rooted_labelled, GeneToSpecies, all_stride_dup_genes, qNoRecon, hog_writer):
     """ """
-    # Create directory structure
-    speciesDict = ogSet.SpeciesDict()
-    SequenceDict = ogSet.SequenceDict()
-    # Write directory and file structure
-    qInitialisedSuspectGenesDirs = False
-    speciesIDs = ogSet.speciesToUse
-    nspecies = len(speciesIDs)
-    dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
-    for index1 in xrange(nspecies):
-        d = dResultsOrthologues + "Orthologues_" + speciesDict[str(speciesIDs[index1])] + "/"
-        if not os.path.exists(d): os.mkdir(d)
-        for index2 in xrange(nspecies):
-            if index2 == index1: continue
-            with open(d + '%s__v__%s.tsv' % (speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]), csv_write_mode) as outfile:
-                writer1 = csv.writer(outfile, delimiter="\t")
-                writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]))
-    neighbours = GetSpeciesNeighbours(species_tree_rooted_labelled)
-    # Infer orthologues and write them to file
-    nOgs = len(ogSet.OGs())
-    nOrthologues_SpPair = util.nOrtho_sp(nspecies)
-    species = list(speciesDict.keys())
-    reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
-    spec_seq_dict = ogSet.Spec_SeqDict()
-    sp_to_index = {str(sp):i for i, sp in enumerate(ogSet.speciesToUse)}
-    with open(files.FileHandler.GetDuplicationsFN(), csv_write_mode) as outfile, OrthologsFiles(dResultsOrthologues, speciesDict, ogSet.speciesToUse, nspecies, sp_to_index) as (ortholog_file_writers, suspect_genes_file_writers):
-        dupWriter = csv.writer(outfile, delimiter="\t")
-        dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
-        for iog in range(nOgs):
-            rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted_labelled, GeneToSpecies)     # this can be parallelised easily
-            if rooted_tree_ids is None: continue
-            # Write rooted tree with accessions
-            util.RenameTreeTaxa(rooted_tree_ids, files.FileHandler.GetOGsTreeFN(iog, True), spec_seq_dict, qSupport=qHaveSupport, qFixNegatives=True, qViaCopy=True)
-            orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog, rooted_tree_ids, species_tree_rooted_labelled, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes, qNoRecon=qNoRecon)
-            GetHOGs_from_tree(iog, recon_tree, hog_writer)
-            qContainsSuspectGenes = len(suspect_genes) > 0
-            if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
-                qInitialisedSuspectGenesDirs = True
-                dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
-                dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
-                for index1 in xrange(nspecies):
-                    with open(dSuspectOrthologues + '%s.tsv' % speciesDict[str(speciesIDs[index1])], csv_write_mode) as outfile:
-                        writer1 = csv.writer(outfile, delimiter="\t")
-                        writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], "Other"))
-            for index0 in xrange(nspecies):
-                strsp0 = species[index0]
-                strsp0_ = strsp0+"_"
-                these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
-                if len(these_genes) > 0:
-                    with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'a') as outfile:
-                        outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
-            allOrthologues = [(iog, orthologues)]
-            # don't relabel nodes, they've already been done
-            util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict, qSupport=False, qFixNegatives=True)
-            if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
-                util.PrintTime("Done %d of %d" % (iog, nOgs))
-            nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, dResultsOrthologues, ortholog_file_writers, suspect_genes_file_writers, qContainsSuspectGenes)
+    try:
+        # Create directory structure
+        speciesDict = ogSet.SpeciesDict()
+        SequenceDict = ogSet.SequenceDict()
+        # Write directory and file structure
+        qInitialisedSuspectGenesDirs = False
+        speciesIDs = ogSet.speciesToUse
+        nspecies = len(speciesIDs)
+        dResultsOrthologues = files.FileHandler.GetOrthologuesDirectory()
+        for index1 in xrange(nspecies):
+            d = dResultsOrthologues + "Orthologues_" + speciesDict[str(speciesIDs[index1])] + "/"
+            if not os.path.exists(d): os.mkdir(d)
+            for index2 in xrange(nspecies):
+                if index2 == index1: continue
+                with open(d + '%s__v__%s.tsv' % (speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]), csv_write_mode) as outfile:
+                    writer1 = csv.writer(outfile, delimiter="\t")
+                    writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], speciesDict[str(speciesIDs[index2])]))
+        neighbours = GetSpeciesNeighbours(species_tree_rooted_labelled)
+        # Infer orthologues and write them to file
+        nOgs = len(ogSet.OGs())
+        nOrthologues_SpPair = util.nOrtho_sp(nspecies)
+        species = list(speciesDict.keys())
+        reconTreesRenamedDir = files.FileHandler.GetOGsReconTreeDir(True)
+        spec_seq_dict = ogSet.Spec_SeqDict()
+        sp_to_index = {str(sp):i for i, sp in enumerate(ogSet.speciesToUse)}
+        with open(files.FileHandler.GetDuplicationsFN(), csv_write_mode) as outfile, OrthologsFiles(dResultsOrthologues, speciesDict, ogSet.speciesToUse, nspecies, sp_to_index) as (ortholog_file_writers, suspect_genes_file_writers):
+            dupWriter = csv.writer(outfile, delimiter="\t")
+            dupWriter.writerow(["Orthogroup", "Species Tree Node", "Gene Tree Node", "Support", "Type", "Genes 1", "Genes 2"])
+            for iog in range(nOgs):
+                rooted_tree_ids, qHaveSupport = CheckAndRootTree(files.FileHandler.GetOGsTreeFN(iog), species_tree_rooted_labelled, GeneToSpecies)     # this can be parallelised easily
+                if rooted_tree_ids is None: continue
+                # Write rooted tree with accessions
+                util.RenameTreeTaxa(rooted_tree_ids, files.FileHandler.GetOGsTreeFN(iog, True), spec_seq_dict, qSupport=qHaveSupport, qFixNegatives=True, qViaCopy=True)
+                orthologues, recon_tree, suspect_genes = GetOrthologues_from_tree(iog, rooted_tree_ids, species_tree_rooted_labelled, GeneToSpecies, neighbours, dupsWriter=dupWriter, seqIDs=spec_seq_dict, spIDs=ogSet.SpeciesDict(), all_stride_dup_genes=all_stride_dup_genes, qNoRecon=qNoRecon)
+                GetHOGs_from_tree(iog, recon_tree, hog_writer)
+                qContainsSuspectGenes = len(suspect_genes) > 0
+                if (not qInitialisedSuspectGenesDirs) and qContainsSuspectGenes:
+                    qInitialisedSuspectGenesDirs = True
+                    dSuspectGenes = files.FileHandler.GetSuspectGenesDir()
+                    dSuspectOrthologues = files.FileHandler.GetPutativeXenelogsDir()
+                    for index1 in xrange(nspecies):
+                        with open(dSuspectOrthologues + '%s.tsv' % speciesDict[str(speciesIDs[index1])], csv_write_mode) as outfile:
+                            writer1 = csv.writer(outfile, delimiter="\t")
+                            writer1.writerow(("Orthogroup", speciesDict[str(speciesIDs[index1])], "Other"))
+                for index0 in xrange(nspecies):
+                    strsp0 = species[index0]
+                    strsp0_ = strsp0+"_"
+                    these_genes = [g for g in suspect_genes if g.startswith(strsp0_)]
+                    if len(these_genes) > 0:
+                        with open(dSuspectGenes + speciesDict[strsp0] + ".txt", 'a') as outfile:
+                            outfile.write("\n".join([SequenceDict[g] for g in these_genes]) + "\n")
+                allOrthologues = [(iog, orthologues)]
+                # don't relabel nodes, they've already been done
+                util.RenameTreeTaxa(recon_tree, reconTreesRenamedDir + "OG%07d_tree.txt" % iog, spec_seq_dict, qSupport=False, qFixNegatives=True)
+                if iog >= 0 and divmod(iog, 10 if nOgs <= 200 else 100 if nOgs <= 2000 else 1000)[1] == 0:
+                    util.PrintTime("Done %d of %d" % (iog, nOgs))
+                nOrthologues_SpPair += AppendOrthologuesToFiles(allOrthologues, speciesDict, ogSet.speciesToUse, SequenceDict, dResultsOrthologues, ortholog_file_writers, suspect_genes_file_writers, qContainsSuspectGenes)
+    except IOError as e:
+        if str(e).startswith("[Errno 24] Too many open files"):
+            util.number_open_files_exception_advice(len(ogSet.speciesToUse), True)
+            util.Fail()
+        else:
+            raise
     return nOrthologues_SpPair


diff --git a/scripts_of/util.py b/scripts_of/util.py
index e1e5c978..79e692b6 100644
--- a/scripts_of/util.py
+++ b/scripts_of/util.py
@@ -378,7 +378,37 @@ def FlowText(text, n=60):
             lines += text
             text = ""
     return lines
-
+
+def number_open_files_exception_advice(n_species, q_at_trees):
+    """
+    Prints advice for the user on the "IOError: [Errno 24] Too many open files" exception
+    Args:
+        n_species - the number of species in the analysis
+        q_at_trees - has this error occurred at the orthologs-from-trees stage
+    """
+    # parallel_task_manager.RunCommand("ulimit -Hn")
+    n_req = n_species*n_species + 100
+    msg="\nERROR: The system limit on the number of files a process can open is too low. For %d species \
+OrthoFinder needs to be able to open at least r=%d files. Please increase the limit and restart OrthoFinder:\n\
+1. Check the hard and soft limits on the number of open files for your system:\n\
+    $ ulimit -Hn\n\
+    $ ulimit -Sn\n\
+2. If the hard limit h > r already, then you just need to increase the soft limit:\n\
+    $ ulimit -n %d\n\
+3. Alternatively, if h < r then you need to edit the file '/etc/security/limits.conf'; this requires root privileges. \
+To increase the limit to %d for a user called 'emms', add the lines:\n\
+    emms hard nofile %d\n\
+    emms soft nofile %d\n" % (n_species, n_req, n_req, n_req, n_req, n_req)
+    msg +=" (edit these lines to match your username)\n\
+4. Check the limit has now been updated (if you changed the hard limit you'll need to open a new session and confirm it's updated):\n\
+    $ ulimit -Sn"
+
+    if q_at_trees:
+        msg_part_2 = "5. Once the limit is updated, restart OrthoFinder 'from trees' using the '-ft' command"
+    else:
+        msg_part_2 = "5. Once the limit is updated, restart OrthoFinder with the original command"
+    msg_part_3 = "\nFor full details see: https://github.com/davidemms/OrthoFinder/issues/384"
+    print(msg + "\n" + msg_part_2 + "\n" + msg_part_3 + "\n")
 
 """
 -------------------------------------------------------------------------------
 """
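
Note (not part of the patch): the soft and hard limits that the advice message tells the user to inspect with 'ulimit -Sn' / 'ulimit -Hn' can also be queried from Python itself via the standard-library resource module (Unix only). The sketch below is only an illustration of that idea, assuming the same n_species*n_species + 100 requirement used in number_open_files_exception_advice; the helper name check_open_file_limit is hypothetical and is not an OrthoFinder function.

    # Sketch only (Unix): query RLIMIT_NOFILE instead of opening test files on disk.
    import resource

    def check_open_file_limit(n_species, n_buffer=100):
        """Return (ok, soft, hard) for the open-file limit needed for n_species species."""
        n_req = n_species * n_species + n_buffer      # same requirement as the advice message
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft >= n_req:
            return True, soft, hard
        if hard == resource.RLIM_INFINITY or hard >= n_req:
            # The soft limit can be raised up to the hard limit without root privileges
            resource.setrlimit(resource.RLIMIT_NOFILE, (n_req, hard))
            return True, n_req, hard
        return False, soft, hard                      # the hard limit itself must be raised (steps 3-4 above)

    if __name__ == "__main__":
        ok, soft, hard = check_open_file_limit(n_species=12)
        print(("OK" if ok else "Too low") + ": soft=%d, hard=%d" % (soft, hard))

The patch instead opens real test files in the working directory, which also verifies that the directory layout and filesystem behave as expected; the resource-based check would only test the limit itself.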