From 4c0900542349e24a2a82518a7db6d9a245a5e070 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 8 Dec 2023 16:18:22 -0800
Subject: [PATCH] Deprecate read_strains from utils, move to io

---
 DEPRECATED.md                         | 10 ++++++++++
 augur/filter/include_exclude_rules.py |  2 +-
 augur/io/__init__.py                  |  1 +
 augur/io/strains.py                   | 27 +++++++++++++++++++++++++++
 augur/utils.py                        |  5 +++++
 tests/io/test_strains.py              | 19 +++++++++++++++++++
 tests/test_utils.py                   | 15 ---------------
 7 files changed, 63 insertions(+), 16 deletions(-)
 create mode 100644 augur/io/strains.py
 create mode 100644 tests/io/test_strains.py

diff --git a/DEPRECATED.md b/DEPRECATED.md
index 660617e28..96ec34136 100644
--- a/DEPRECATED.md
+++ b/DEPRECATED.md
@@ -11,3 +11,13 @@ January 2024 or after.*
 
 `augur export v2` was introduced in Augur version 6.0.0. Migrate by following
 the [official guide](https://docs.nextstrain.org/projects/augur/page/releases/migrating-v5-v6.html).
+
+## `augur.utils.read_strains`
+
+*Deprecated December 2023. Planned for removal March 2024 or after.*
+
+This is part of a [larger effort](https://github.com/nextstrain/augur/issues/1011)
+to formalize Augur's Python API.
+
+Because this function has known external users, it remains available as
+`augur.io.read_strains`. Update imports to use that location.
diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
index 9889ba2e6..9ea1e03e8 100644
--- a/augur/filter/include_exclude_rules.py
+++ b/augur/filter/include_exclude_rules.py
@@ -9,8 +9,8 @@
 from augur.errors import AugurError
 from augur.io.metadata import METADATA_DATE_COLUMN
 from augur.io.print import print_err
+from augur.io.strains import read_strains
 from augur.io.vcf import is_vcf as filename_is_vcf
-from augur.utils import read_strains
 from . import constants
 
 try:
diff --git a/augur/io/__init__.py b/augur/io/__init__.py
index 685f9ff78..4a721bffb 100644
--- a/augur/io/__init__.py
+++ b/augur/io/__init__.py
@@ -5,3 +5,4 @@
 from .file import open_file  # noqa: F401
 from .metadata import read_metadata  # noqa: F401
 from .sequences import read_sequences, write_sequences  # noqa: F401
+from .strains import read_strains  # noqa: F401
diff --git a/augur/io/strains.py b/augur/io/strains.py
new file mode 100644
index 000000000..170212633
--- /dev/null
+++ b/augur/io/strains.py
@@ -0,0 +1,27 @@
+from augur.utils import read_entries
+
+
+def read_strains(*files, comment_char="#"):
+    """Reads the set of distinct strain names from one or more plain text files.
+
+    Strain names can be commented with full-line or inline comments. For
+    example, the following is a valid strain names file::
+
+        # this is a comment at the top of the file
+        strain1  # exclude strain1 because it isn't sequenced properly
+        strain2
+          # this is a comment preceded by whitespace; the line is ignored.
+
+    Parameters
+    ----------
+    files : iterable of str
+        one or more names of text files with one strain name per line
+    comment_char : str
+        character that begins a comment (default ``#``); passed to read_entries
+
+    Returns
+    -------
+    set :
+        strain names from the given input files
+    """
+    return set(read_entries(*files, comment_char=comment_char))
diff --git a/augur/utils.py b/augur/utils.py
index b0790ab2c..f759a7df8 100644
--- a/augur/utils.py
+++ b/augur/utils.py
@@ -5,10 +5,12 @@
 import os, json, sys
 import pandas as pd
 from collections import defaultdict, OrderedDict
+from textwrap import dedent
 from .__version__ import __version__
 
 from augur.data import as_file
 from augur.io.file import open_file
+from augur.io.print import print_err
 
 from augur.types import ValidationMode
 from augur.errors import AugurError
@@ -738,6 +740,9 @@ def load_mask_sites(mask_file):
 
 
 def read_strains(*files, comment_char="#"):
+    print_err(dedent("""
+        DEPRECATION WARNING: augur.utils.read_strains is no longer maintained and will be removed in the future.
+        Please use augur.io.read_strains instead."""))  # printed on every call; the return value below is unchanged
     return set(read_entries(*files, comment_char=comment_char))
 
 
diff --git a/tests/io/test_strains.py b/tests/io/test_strains.py
new file mode 100644
index 000000000..dfd4ffe8b
--- /dev/null
+++ b/tests/io/test_strains.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+from augur.io.strains import read_strains
+
+
+def test_read_strains(tmpdir):
+    # Write one list of strain names with inline and whitespace-prefixed comments.
+    strains1 = Path(tmpdir) / Path("strains1.txt")
+    with open(strains1, "w") as oh:
+        oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n   # this is a comment preceded by whitespace.\n")
+
+    # Write another list of strain names that starts with a full-line comment.
+    strains2 = Path(tmpdir) / Path("strains2.txt")
+    with open(strains2, "w") as oh:
+        oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n")
+
+    strains = read_strains(strains1, strains2)
+    # "strain2" is in both files and must appear once; comment text must be dropped.
+    assert strains == {"strain1", "strain2", "strain3"}
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e4661a249..a1183cf58 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -79,21 +79,6 @@ def test_read_mask_file_drm_file(self, tmpdir):
             fh.write("\n".join(drm_lines))
         assert utils.read_mask_file(drm_file) == expected_sites
 
-    def test_read_strains(self, tmpdir):
-        # Write one list of filenames with some unnecessary whitespace.
-        strains1 = Path(tmpdir) / Path("strains1.txt")
-        with open(strains1, "w") as oh:
-            oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n   # this is a comment preceded by whitespace.\n")
-
-        # Write another list of filenames with a comment.
-        strains2 = Path(tmpdir) / Path("strains2.txt")
-        with open(strains2, "w") as oh:
-            oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n")
-
-        strains = utils.read_strains(strains1, strains2)
-        assert len(strains) == 3
-        assert "strain1" in strains
-
     def test_write_json_data_types(self, tmpdir):
         """write_json should be able to serialize various data types."""
         data = {