diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 64e064ea..49bde193 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -87,7 +87,7 @@ For merging, you should: 1. Ensure tests pass. 2. Update documentation when there's new API, functionality, etc. -3. Add yourself to ``CONTRIBUTORS.rst``. +3. Add yourself to ``CONTRIBUTORS.rst`` if you'd like. Documentation ------------- diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 8db2cb16..670259e5 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -31,6 +31,7 @@ Alan Moffet `@amoffet`_ Anatoli Babenia `@abitrolly`_ Castedo Ellerman `@castedo`_ Gerard Manning `@GerardManning`_ +Julian Runnels `@JulianRunnels`_ Kevin Arvai `@arvkevi`_ Phil Palmer `@PhilPalmer`_ Yoan Bouzin @@ -40,5 +41,6 @@ Yoan Bouzin .. _@abitrolly: https://github.com/abitrolly .. _@castedo: https://github.com/castedo .. _@GerardManning: https://github.com/GerardManning +.. _@JulianRunnels: https://github.com/JulianRunnels .. _@arvkevi: https://github.com/arvkevi .. _@PhilPalmer: https://github.com/PhilPalmer diff --git a/README.rst b/README.rst index fc82835e..76171cbb 100644 --- a/README.rst +++ b/README.rst @@ -156,12 +156,12 @@ As the data gets added, it's compared to the existing data, and SNP position and discrepancies are identified. (The discrepancy thresholds can be tuned via parameters.) These discrepant SNPs are available for inspection after the merge via properties of the ``SNPs`` object. -Additionally, any non-called / null genotypes will be updated during the merge, if the file -being merged has a called genotype for the SNP. - >>> len(s.discrepant_merge_genotypes) 151 +Additionally, any non-called / null genotypes will be updated during the merge, if the file +being merged has a called genotype for the SNP. + Finally, ``merge`` returns a list of ``dict``, where each ``dict`` has information corresponding to the results of each merge (e.g., SNPs in common). 
diff --git a/analysis/parse-opensnp-files/parse_opensnp_files.py b/analysis/parse-opensnp-files/parse_opensnp_files.py index 770898dd..c2bb56b4 100644 --- a/analysis/parse-opensnp-files/parse_opensnp_files.py +++ b/analysis/parse-opensnp-files/parse_opensnp_files.py @@ -28,7 +28,7 @@ # setup logger to output to file in output directory logging.basicConfig( - filename="{}".format(os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")), + filename=f'{os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")}', format="%(asctime)s: %(message)s", filemode="w", level=logging.INFO, @@ -90,12 +90,10 @@ def main(): # log parsing statistics file_count = len(filenames) - logger.info("{} files in the openSNP datadump".format(file_count)) - logger.info("{:.2%} of openSNP datadump files parsed".format(len(df) / file_count)) + logger.info(f"{file_count} files in the openSNP datadump") + logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed") logger.info( - "build detected in {:.2%} of files parsed".format( - len(df.loc[df.build_detected]) / len(df) - ) + f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed" ) # extract files from the datadump where `load_file` returned a message @@ -118,9 +116,7 @@ def main(): continue # create a directory for each message (prefix indicates number of files) - path = os.path.join( - OUTPUT_DIR, "{:04}_{}".format(len(files), clean_str(msg)) - ) + path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}") create_dir(path) # save each file with message into created directory for filename in files: diff --git a/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py b/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py index af4f9c8c..bf8322df 100644 --- a/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py +++ b/analysis/xy-chrom-snp-ratios/xy-chrom-snp-ratios.py @@ -22,7 +22,7 @@ # setup logger to output to file in output directory logging.basicConfig( - filename="{}".format(os.path.join(OUTPUT_DIR, 
"xy-chrom-snp-ratios.txt")), + filename=f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.txt')}", format="%(asctime)s#%(message)s", filemode="w", level=logging.INFO, @@ -35,10 +35,10 @@ def get_xy_chrom_snp_ratios(task): file = task["file"] try: - logger.info("loading {}".format(file)) + logger.info(f"loading {file}") s = SNPs(r.load_opensnp_datadump_file(file), assign_par_snps=False) except Exception as err: - logger.error("{}#{}".format(file, err)) + logger.error(f"{file}#{err}") return None try: @@ -72,10 +72,10 @@ def get_xy_chrom_snp_ratios(task): s.count, ] else: - logger.info("{}#{}".format(file, "no SNPs processed")) + logger.info(f"{file}#{'no SNPs processed'}") except Exception as err: - logger.error("{}#{}".format(file, err)) + logger.error(f"{file}#{err}") return None @@ -99,9 +99,7 @@ def create_analysis_plot( # start with a rectangular Figure fig = plt.figure(figsize=(8, 8)) - fig.suptitle( - "Analysis of openSNP datadump XY chrom SNP ratios; N = {}".format(len(df)) - ) + fig.suptitle(f"Analysis of openSNP datadump XY chrom SNP ratios; N = {len(df)}") ax_scatter = plt.axes(rect_scatter) ax_scatter.tick_params(direction="in", top=True, right=True) @@ -127,12 +125,12 @@ def create_analysis_plot( heterozygous_x_snps_threshold_line = ax_scatter.axvline( x=heterozygous_x_snps_threshold, c="blue", - label="Het. X threshold={}".format(heterozygous_x_snps_threshold), + label=f"Het. 
X threshold={heterozygous_x_snps_threshold}", ) y_snps_not_null_threshold_line = ax_scatter.axhline( y=y_snps_not_null_threshold, c="red", - label="Y not null threshold={}".format(y_snps_not_null_threshold), + label=f"Y not null threshold={y_snps_not_null_threshold}", ) # fill genotype areas @@ -178,17 +176,7 @@ def create_analysis_plot( x_offset = lim_x * 0.01 y_offset = lim_y * 0.01 ax_scatter.annotate( - "n={}".format( - len( - df_ratios.loc[ - ( - df_ratios.heterozygous_x_snps_ratio - < heterozygous_x_snps_threshold - ) - & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold) - ] - ) - ), + f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}", ( heterozygous_x_snps_threshold - x_offset, y_snps_not_null_threshold - y_offset, @@ -197,17 +185,7 @@ def create_analysis_plot( va="top", ) ax_scatter.annotate( - "n={}".format( - len( - df_ratios.loc[ - ( - df_ratios.heterozygous_x_snps_ratio - < heterozygous_x_snps_threshold - ) - & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold) - ] - ) - ), + f"n={ len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio < heterozygous_x_snps_threshold)& (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}", ( heterozygous_x_snps_threshold - x_offset, y_snps_not_null_threshold + y_offset, @@ -216,17 +194,7 @@ def create_analysis_plot( va="bottom", ) ax_scatter.annotate( - "n={}".format( - len( - df_ratios.loc[ - ( - df_ratios.heterozygous_x_snps_ratio - >= heterozygous_x_snps_threshold - ) - & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold) - ] - ) - ), + f"n={len(df_ratios.loc[ (df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio >= y_snps_not_null_threshold)])}", ( heterozygous_x_snps_threshold + x_offset, y_snps_not_null_threshold + y_offset, @@ -235,17 +203,7 @@ def create_analysis_plot( va="bottom", ) ax_scatter.annotate( 
- "n={}".format( - len( - df_ratios.loc[ - ( - df_ratios.heterozygous_x_snps_ratio - >= heterozygous_x_snps_threshold - ) - & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold) - ] - ) - ), + f"n={len(df_ratios.loc[(df_ratios.heterozygous_x_snps_ratio >= heterozygous_x_snps_threshold) & (df_ratios.y_snps_not_null_ratio < y_snps_not_null_threshold)])}", ( heterozygous_x_snps_threshold + x_offset, y_snps_not_null_threshold - y_offset, @@ -307,7 +265,7 @@ def create_analysis_plot( # save output with atomic_write( - "{}".format(os.path.join(OUTPUT_DIR, "xy-chrom-snp-ratios.png")), + f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.png')}", mode="wb", overwrite=True, ) as f: diff --git a/docs/index.rst b/docs/index.rst index 1948f276..9dfa6878 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,6 +14,7 @@ README output_files + installation snps_banner changelog contributing diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 00000000..2705fac7 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,83 @@ +Installation +============ + +``snps`` is `available `_ on the +`Python Package Index `_. Install ``snps`` (and its required +Python dependencies) via ``pip``:: + + $ pip install snps + +Installation and Usage on a Raspberry Pi +---------------------------------------- +The instructions below provide the steps to install ``snps`` on a +`Raspberry Pi `_ (tested with +"`Raspberry Pi OS `_ (32-bit) Lite", +release date 2020-08-20). For more details about Python on the Raspberry Pi, see +`here `_. + +.. note:: Text after a prompt (e.g., ``$``) is the command to type at the command line. The + instructions assume a fresh install of Raspberry Pi OS and that after logging in as + the ``pi`` user, the current working directory is ``/home/pi``. + +1. Install ``pip`` for Python 3:: + + pi@raspberrypi:~ $ sudo apt install python3-pip + + Press "y" followed by "enter" to continue. 
This enables us to install packages from the + Python Package Index. + +2. Install the ``venv`` module:: + + pi@raspberrypi:~ $ sudo apt install python3-venv + + Press "y" followed by "enter" to continue. This enables us to create a + `virtual environment `_ to isolate the ``snps`` + installation from other system Python packages. + +3. `Install ATLAS `_:: + + pi@raspberrypi:~ $ sudo apt install libatlas-base-dev + + Press "y" followed by "enter" to continue. This is required for `NumPy `_, a + dependency of ``snps``. + +4. Create a directory for ``snps`` and change working directory:: + + pi@raspberrypi:~ $ mkdir snps + pi@raspberrypi:~ $ cd snps + +5. Create a virtual environment for ``snps``:: + + pi@raspberrypi:~/snps $ python3 -m venv .venv + + The virtual environment is located at ``/home/pi/snps/.venv``. + +6. Activate the virtual environment:: + + pi@raspberrypi:~/snps $ source .venv/bin/activate + + Now when you invoke Python or ``pip``, the virtual environment's version will be used (as + indicated by the ``(.venv)`` before the prompt). This can be verified as follows:: + + (.venv) pi@raspberrypi:~/snps $ which python + /home/pi/snps/.venv/bin/python + +7. Install ``snps``:: + + (.venv) pi@raspberrypi:~/snps $ pip install snps + +8. Start Python:: + + (.venv) pi@raspberrypi:~/snps $ python + Python 3.7.3 (default, Jul 25 2020, 13:03:44) + [GCC 8.3.0] on linux + Type "help", "copyright", "credits" or "license" for more information. + >>> + +9. Use ``snps``; examples shown in the README should now work. + +10. 
At completion of usage, the virtual environment can be deactivated:: + + (.venv) pi@raspberrypi:~/snps $ deactivate + pi@raspberrypi:~/snps $ + diff --git a/src/snps/ensembl.py b/src/snps/ensembl.py index 03bbdb67..35a322d7 100644 --- a/src/snps/ensembl.py +++ b/src/snps/ensembl.py @@ -116,9 +116,7 @@ def perform_rest_action(self, endpoint, hdrs=None, params=None): self.perform_rest_action(endpoint, hdrs, params) else: sys.stderr.write( - "Request failed for {0}: Status code: {1.code} Reason: {1.reason}\n".format( - endpoint, e - ) + f"Request failed for {endpoint}: Status code: {e.code} Reason: {e.reason}\n" ) return data diff --git a/src/snps/io/reader.py b/src/snps/io/reader.py index 68d07057..dfcd9401 100644 --- a/src/snps/io/reader.py +++ b/src/snps/io/reader.py @@ -268,11 +268,7 @@ def _detect_build_from_comments(self, comments, source): # allow more variations for VCF if source == "vcf": if "https://pypi.org/project/snps/" in comments: # remove `snps` version - comments = "{}{}".format( - comments[: comments.find("snps v")], - comments[comments.find("https://pypi.org/project/snps/") :], - ) - + comments = f"{comments[: comments.find('snps v')]}{comments[comments.find('https://pypi.org/project/snps/'):]}" if "hg19" in comments: return 37 elif "ncbi36" in comments: @@ -827,9 +823,7 @@ def map_pos(x): df["rsid"] = df["SNP Name"].apply(map_rsids) df["chrom"] = df["SNP Name"].apply(map_chr) df["pos"] = df["SNP Name"].apply(map_pos) - df["genotype"] = ( - df["Allele1 - {}".format(strand)] + df["Allele2 - {}".format(strand)] - ) + df["genotype"] = df[f"Allele1 - {strand}"] + df[f"Allele2 - {strand}"] df.dropna(subset=["rsid", "chrom", "pos"], inplace=True) df = df.astype(NORMALIZED_DTYPES) @@ -1183,7 +1177,7 @@ def _parse_vcf(self, buffer, rsids): record_array = [ rsid, - "{}".format(line_split[0]).strip("chr"), + f"{line_split[0]}".strip("chr"), line_split[1], genotype, ] diff --git a/src/snps/io/writer.py b/src/snps/io/writer.py index f32ab6d6..2e7905c9 100644 
--- a/src/snps/io/writer.py +++ b/src/snps/io/writer.py @@ -120,24 +120,15 @@ def _write_csv(self): if "sep" in self._kwargs and self._kwargs["sep"] == ",": ext = ".csv" - filename = "{}_{}{}".format( - clean_str(self._snps.source), self._snps.assembly, ext - ) + filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{ext}" comment = ( - "# Source(s): {}\n" - "# Build: {}\n" - "# Build Detected: {}\n" - "# Phased: {}\n" - "# SNPs: {}\n" - "# Chromosomes: {}\n".format( - self._snps.source, - self._snps.build, - self._snps.build_detected, - self._snps.phased, - self._snps.count, - self._snps.chromosomes_summary, - ) + f"# Source(s): {self._snps.source}\n" + f"# Build: {self._snps.build}\n" + f"# Build Detected: { self._snps.build_detected}\n" + f"# Phased: {self._snps.phased}\n" + f"# SNPs: {self._snps.count}\n" + f"# Chromosomes: {self._snps.chromosomes_summary}\n" ) if "header" in self._kwargs: if isinstance(self._kwargs["header"], bool): @@ -152,7 +143,7 @@ def _write_csv(self): filename, comment=comment, atomic=self._atomic, - **self._kwargs + **self._kwargs, ) def _write_vcf(self): @@ -172,18 +163,12 @@ def _write_vcf(self): """ filename = self._filename if not filename: - filename = "{}_{}{}".format( - clean_str(self._snps.source), self._snps.assembly, ".vcf" - ) + filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{'.vcf'}" comment = ( - "##fileformat=VCFv4.2\n" - "##fileDate={}\n" - '##source="{}; snps v{}; https://pypi.org/project/snps/"\n'.format( - datetime.datetime.utcnow().strftime("%Y%m%d"), - self._snps.source, - snps.__version__, - ) + f"##fileformat=VCFv4.2\n" + f'##fileDate={datetime.datetime.utcnow().strftime("%Y%m%d")}\n' + f'##source="{self._snps.source}; snps v{snps.__version__}; https://pypi.org/project/snps/"\n' ) reference_sequence_chroms = ( @@ -299,9 +284,7 @@ def _create_vcf_representation(self, task): seqs = resources.get_reference_sequences(assembly, [chrom]) seq = seqs[chrom] - contig = '##contig=\n'.format( - 
seq.ID, seq.url, seq.length, seq.build, seq.md5, seq.species - ) + contig = f'##contig=<ID={seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">\n' snps = snps.reset_index() @@ -401,8 +384,8 @@ def _compute_genotype(self, ref, alt, genotype): alleles.extend(alt.split(",")) if len(genotype) == 2: - return "{}{}{}".format( - alleles.index(genotype[0]), separator, alleles.index(genotype[1]) + return ( + f"{alleles.index(genotype[0])}{separator}{alleles.index(genotype[1])}" ) else: - return "{}".format(alleles.index(genotype[0])) + return f"{alleles.index(genotype[0])}" diff --git a/src/snps/resources.py b/src/snps/resources.py index 486a6d4e..3fb7b6e6 100644 --- a/src/snps/resources.py +++ b/src/snps/resources.py @@ -453,15 +453,14 @@ def _get_paths_reference_sequences( return ("", [], [], []) filenames = [ - "Homo_sapiens.{}.{}dna.chromosome.{}.fa.gz".format(assembly, release, chrom) + f"Homo_sapiens.{assembly}.{release}dna.chromosome.{chrom}.fa.gz" for chrom in chroms ] - urls = ["{}{}".format(base, filename) for filename in filenames] + urls = [f"{base}{filename}" for filename in filenames] local_filenames = [ - "{}{}{}{}{}".format(sub_dir, os.sep, assembly, os.sep, filename) - for filename in filenames + f"{sub_dir}{os.sep}{assembly}{os.sep}{filename}" for filename in filenames ] return ( @@ -530,7 +529,7 @@ def _get_path_assembly_mapping_data( ) if not os.path.exists(destination): - logger.info("Downloading {}".format(os.path.relpath(destination))) + logger.info(f"Downloading {os.path.relpath(destination)}") self._download_assembly_mapping_data( destination, chroms, source_assembly, target_assembly, retries @@ -546,8 +545,8 @@ def _download_assembly_mapping_data( for chrom in chroms: file = chrom + ".json" - map_endpoint = "/map/human/{}/{}/{}?".format( - source_assembly, chrom, target_assembly + map_endpoint = ( + f"/map/human/{source_assembly}/{chrom}/{target_assembly}?"
) # get assembly mapping data @@ -665,7 +664,7 @@ def _download_file(self, url, filename, compress=False, timeout=30): timeout=timeout, ) except socket.timeout: - logger.warning("Timeout downloading {}".format(url)) + logger.warning(f"Timeout downloading {url}") destination = "" return destination @@ -679,7 +678,7 @@ def _print_download_msg(path): path : str path to file being downloaded """ - logger.info("Downloading {}".format(os.path.relpath(path))) + logger.info(f"Downloading {os.path.relpath(path)}") class ReferenceSequence: @@ -721,9 +720,7 @@ def __init__(self, ID="", url="", path="", assembly="", species="", taxonomy="") self._length = 0 def __repr__(self): - return "ReferenceSequence(assembly={!r}, ID={!r})".format( - self._assembly, self._ID - ) + return f"ReferenceSequence(assembly={self._assembly!r}, ID={self._ID!r})" @property def ID(self): @@ -784,7 +781,7 @@ def build(self): str e.g., "B37" """ - return "B{}".format(self._assembly[-2:]) + return f"B{self._assembly[-2:]}" @property def species(self): diff --git a/src/snps/snps.py b/src/snps/snps.py index 706811a4..8ed492a2 100644 --- a/src/snps/snps.py +++ b/src/snps/snps.py @@ -165,7 +165,7 @@ def __init__( logger.warning("no SNPs loaded...") def __repr__(self): - return "SNPs({!r})".format(self._file[0:50]) + return f"SNPs({self._file[0:50]!r})" @property def source(self): @@ -421,9 +421,9 @@ def chromosomes_summary(self): def as_range(iterable): l = list(iterable) if len(l) > 1: - return "{0}-{1}".format(l[0], l[-1]) + return f"{l[0]}-{l[-1]}" else: - return "{0}".format(l[0]) + return f"{l[0]}" # create str representations int_chroms = ", ".join( @@ -489,19 +489,13 @@ def heterozygous(self, chrom=""): pandas.DataFrame normalized ``snps`` dataframe """ - if chrom: - return self._snps.loc[ - (self._snps.chrom == chrom) - & (self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] != self._snps.genotype.str[1]) - ] - else: - return self._snps.loc[ - 
(self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] != self._snps.genotype.str[1]) - ] + df = self._filter(chrom) + + return df.loc[ + (df.genotype.notnull()) + & (df.genotype.str.len() == 2) + & (df.genotype.str[0] != df.genotype.str[1]) + ] def homozygous(self, chrom=""): """ Get homozygous SNPs. @@ -516,19 +510,13 @@ def homozygous(self, chrom=""): pandas.DataFrame normalized ``snps`` dataframe """ - if chrom: - return self._snps.loc[ - (self._snps.chrom == chrom) - & (self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] == self._snps.genotype.str[1]) - ] - else: - return self._snps.loc[ - (self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] == self._snps.genotype.str[1]) - ] + df = self._filter(chrom) + + return df.loc[ + (df.genotype.notnull()) + & (df.genotype.str.len() == 2) + & (df.genotype.str[0] == df.genotype.str[1]) + ] def notnull(self, chrom=""): """ Get not null genotype SNPs. 
@@ -543,13 +531,9 @@ def notnull(self, chrom=""): pandas.DataFrame normalized ``snps`` dataframe """ + df = self._filter(chrom) - if chrom: - return self._snps.loc[ - (self._snps.chrom == chrom) & (self._snps.genotype.notnull()) - ] - else: - return self._snps.loc[self._snps.genotype.notnull()] + return df.loc[df.genotype.notnull()] @property def summary(self): @@ -624,13 +608,14 @@ def save(self, filename="", vcf=False, atomic=True, **kwargs): self._discrepant_vcf_position = extra[0] self._discrepant_vcf_position.set_index("rsid", inplace=True) logger.warning( - "{} SNP positions were found to be discrepant when saving VCF".format( - len(self.discrepant_vcf_position) - ) + f"{len(self.discrepant_vcf_position)} SNP positions were found to be discrepant when saving VCF" ) return path + def _filter(self, chrom=""): + return self.snps.loc[self.snps.chrom == chrom] if chrom else self.snps + def _read_raw_data(self, file, only_detect_source, rsids): return Reader.read_file(file, only_detect_source, self._resources, rsids) @@ -683,10 +668,10 @@ def _lookup_refsnp_snapshot(self, rsid, rest_client): # this RefSnp id was merged into another # we'll pick the first one to decide which chromosome this PAR will be assigned to merged_id = "rs" + response["merged_snapshot_data"]["merged_into"][0] - logger.info("SNP id {} has been merged into id {}".format(rsid, merged_id)) + logger.info(f"SNP id {rsid} has been merged into id {merged_id}") return self._lookup_refsnp_snapshot(merged_id, rest_client) elif "nosnppos_snapshot_data" in response: - logger.warning("Unable to look up SNP id {}".format(rsid)) + logger.warning(f"Unable to look up SNP id {rsid}") return None else: return response @@ -816,10 +801,7 @@ def get_count(self, chrom=""): ------- int """ - if chrom: - return len(self._snps.loc[(self._snps.chrom == chrom)]) - else: - return len(self._snps) + return len(self._filter(chrom)) def determine_sex( self, @@ -881,26 +863,25 @@ def _get_non_par_start_stop(self, chrom): def 
_get_non_par_snps(self, chrom, heterozygous=True): np_start, np_stop = self._get_non_par_start_stop(chrom) + df = self._filter(chrom) if heterozygous: # get heterozygous SNPs in the non-PAR region (i.e., discrepant XY SNPs) - return self._snps.loc[ - (self._snps.chrom == chrom) - & (self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] != self._snps.genotype.str[1]) - & (self._snps.pos > np_start) - & (self._snps.pos < np_stop) + return df.loc[ + (df.genotype.notnull()) + & (df.genotype.str.len() == 2) + & (df.genotype.str[0] != df.genotype.str[1]) + & (df.pos > np_start) + & (df.pos < np_stop) ].index else: # get homozygous SNPs in the non-PAR region - return self._snps.loc[ - (self._snps.chrom == chrom) - & (self._snps.genotype.notnull()) - & (self._snps.genotype.str.len() == 2) - & (self._snps.genotype.str[0] == self._snps.genotype.str[1]) - & (self._snps.pos > np_start) - & (self._snps.pos < np_stop) + return df.loc[ + (df.genotype.notnull()) + & (df.genotype.str.len() == 2) + & (df.genotype.str[0] == df.genotype.str[1]) + & (df.pos > np_start) + & (df.pos < np_stop) ].index def _deduplicate_rsids(self): @@ -1135,8 +1116,8 @@ def remap(self, target_assembly, complement_bases=True): ) else: logger.warning( - "Chromosome {} not remapped; " - "removing chromosome from SNPs for consistency".format(chrom) + f"Chromosome {chrom} not remapped; " + f"removing chromosome from SNPs for consistency" ) snps = snps.drop(snps.loc[snps["chrom"] == chrom].index) @@ -1322,16 +1303,12 @@ def ensure_same_build(s): # ensure builds match when merging if not s.build_detected: logger.warning( - "Build not detected for {}, assuming Build {}".format( - s.__repr__(), s.build - ) + f"Build not detected for {s.__repr__()}, assuming Build {s.build}" ) if self.build != s.build: logger.info( - "{} has Build {}; remapping to Build {}".format( - s.__repr__(), s.build, self.build - ) + f"{s.__repr__()} has Build {s.build}; remapping to Build 
{self.build}" ) s.remap(self.build) @@ -1413,16 +1390,12 @@ def merge_snps(s, positions_threshold, genotypes_threshold): if 0 < len(discrepant_positions) < positions_threshold: logger.warning( - "{} SNP positions were discrepant; keeping original positions".format( - str(len(discrepant_positions)) - ) + f"{str(len(discrepant_positions))} SNP positions were discrepant; keeping original positions" ) if 0 < len(discrepant_genotypes) < genotypes_threshold: logger.warning( - "{} SNP genotypes were discrepant; marking those as null".format( - str(len(discrepant_genotypes)) - ) + f"{str(len(discrepant_genotypes))} SNP genotypes were discrepant; marking those as null" ) # set discrepant genotypes to null @@ -1461,21 +1434,19 @@ def merge_snps(s, positions_threshold, genotypes_threshold): continue if not self.valid: - logger.info("Loading {}".format(snps_object.__repr__())) + logger.info(f"Loading {snps_object.__repr__()}") init(snps_object) d.update({"merged": True}) else: - logger.info("Merging {}".format(snps_object.__repr__())) + logger.info(f"Merging {snps_object.__repr__()}") if remap: ensure_same_build(snps_object) if self.build != snps_object.build: logger.warning( - "{} has Build {}; this SNPs object has Build {}".format( - snps_object.__repr__(), snps_object.build, self.build - ) + f"{snps_object.__repr__()} has Build {snps_object.build}; this SNPs object has Build {self.build}" ) merged, *extra = merge_snps( diff --git a/src/snps/utils.py b/src/snps/utils.py index 4d270c4b..eb71b2b7 100644 --- a/src/snps/utils.py +++ b/src/snps/utils.py @@ -160,15 +160,12 @@ def save_df_as_csv( destination = filename else: destination = os.path.join(path, filename) - logger.info("Saving {}".format(os.path.relpath(destination))) + logger.info(f"Saving {os.path.relpath(destination)}") if prepend_info: s = ( - "# Generated by snps v{}, https://pypi.org/project/snps/\n" - "# Generated at {} UTC\n".format( - snps.__version__, - datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"), 
- ) + f"# Generated by snps v{snps.__version__}, https://pypi.org/project/snps/\n" + f'# Generated at {datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")} UTC\n' ) else: s = "" diff --git a/tests/__init__.py b/tests/__init__.py index 69cbae2b..96cf92ab 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -329,14 +329,14 @@ def _get_test_assembly_mapping_data(self, source, target, strands, mappings): "strand": strands[0], "start": mappings[0], "end": mappings[0], - "assembly": "{}".format(source), + "assembly": f"{source}", }, "mapped": { "seq_region_name": "1", "strand": strands[1], "start": mappings[1], "end": mappings[1], - "assembly": "{}".format(target), + "assembly": f"{target}", }, }, { @@ -345,14 +345,14 @@ def _get_test_assembly_mapping_data(self, source, target, strands, mappings): "strand": strands[2], "start": mappings[2], "end": mappings[2], - "assembly": "{}".format(source), + "assembly": f"{source}", }, "mapped": { "seq_region_name": "1", "strand": strands[3], "start": mappings[3], "end": mappings[3], - "assembly": "{}".format(target), + "assembly": f"{target}", }, }, { @@ -361,14 +361,14 @@ def _get_test_assembly_mapping_data(self, source, target, strands, mappings): "strand": strands[4], "start": mappings[4], "end": mappings[4], - "assembly": "{}".format(source), + "assembly": f"{source}", }, "mapped": { "seq_region_name": "1", "strand": strands[5], "start": mappings[5], "end": mappings[5], - "assembly": "{}".format(target), + "assembly": f"{target}", }, }, ] @@ -381,14 +381,14 @@ def _get_test_assembly_mapping_data(self, source, target, strands, mappings): "strand": strands[6], "start": mappings[6], "end": mappings[6], - "assembly": "{}".format(source), + "assembly": f"{source}", }, "mapped": { "seq_region_name": "3", "strand": strands[7], "start": mappings[7], "end": mappings[7], - "assembly": "{}".format(target), + "assembly": f"{target}", }, } ] @@ -577,7 +577,7 @@ def run_parsing_tests( with tempfile.TemporaryDirectory() as tmpdir: 
base = os.path.basename(file) - dest = os.path.join(tmpdir, "{}.gz".format(base)) + dest = os.path.join(tmpdir, f"{base}.gz") gzip_file(file, dest) self.make_parsing_assertions( self.parse_file(dest), source, phased, build, build_detected, snps_df @@ -596,7 +596,7 @@ def run_parsing_tests( snps_df, ) - dest = os.path.join(tmpdir, "{}.zip".format(base)) + dest = os.path.join(tmpdir, f"{base}.zip") zip_file(file, dest, base) self.make_parsing_assertions( self.parse_file(dest), source, phased, build, build_detected, snps_df @@ -652,7 +652,7 @@ def run_parsing_tests_vcf( with tempfile.TemporaryDirectory() as tmpdir: base = os.path.basename(file) - dest = os.path.join(tmpdir, "{}.gz".format(base)) + dest = os.path.join(tmpdir, f"{base}.gz") gzip_file(file, dest) self.make_parsing_assertions_vcf( self.parse_file(dest, rsids), diff --git a/tests/io/test_reader.py b/tests/io/test_reader.py index 983fc38c..bf93f660 100644 --- a/tests/io/test_reader.py +++ b/tests/io/test_reader.py @@ -129,10 +129,10 @@ def test_read_ancestry_extra_tab(self): s += "rs1\t1\t101\t\tA\tA\r\n" # generate remainder of lines for i in range(1, total_snps): - s += "rs{}\t1\t{}\tA\tA\r\n".format(1 + i, 101 + i) + s += f"rs{1 + i}\t1\t{101 + i}\tA\tA\r\n" snps_df = self.create_snp_df( - rsid=["rs{}".format(1 + i) for i in range(0, total_snps)], + rsid=[f"rs{1 + i}" for i in range(0, total_snps)], chrom="1", pos=[101 + i for i in range(0, total_snps)], genotype="AA", @@ -179,17 +179,14 @@ def test_read_ftdna_concat_gzip_extra_data(self): # generate content of first file s1 = "RSID,CHROMOSOME,POSITION,RESULT\r\n" for i in range(0, total_snps1): - s1 += '"rs{}","1","{}","AA"\r\n'.format(1 + i, 101 + i) + s1 += f'"rs{1 + i}","1","{101 + i}","AA"\r\n' # generate content of second file s2 = "RSID,CHROMOSOME,POSITION,RESULT\r\n" for i in range(0, total_snps2): - s2 += '"rs{}","1","{}","AA"\r\n'.format( - total_snps1 + 1 + i, total_snps1 + 101 + i - ) - + s2 += f'"rs{total_snps1 + 1 + i}","1","{ total_snps1 + 
101 + i}","AA"\r\n' snps_df = self.create_snp_df( - rsid=["rs{}".format(1 + i) for i in range(0, total_snps1 + total_snps2)], + rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)], chrom="1", pos=[101 + i for i in range(0, total_snps1 + total_snps2)], genotype="AA", @@ -197,9 +194,9 @@ def test_read_ftdna_concat_gzip_extra_data(self): with tempfile.TemporaryDirectory() as tmpdir: file1 = os.path.join(tmpdir, "ftdna_concat_gzip1.csv") - file1_gz = "{}.gz".format(file1) + file1_gz = f"{file1}.gz" file2 = os.path.join(tmpdir, "ftdna_concat_gzip2.csv") - file2_gz = "{}.gz".format(file2) + file2_gz = f"{file2}.gz" path = os.path.join(tmpdir, "ftdna_concat_gzip.csv.gz") # write individual files @@ -244,17 +241,15 @@ def test_read_ftdna_second_header(self): s = "RSID,CHROMOSOME,POSITION,RESULT\n" # generate first chunk of lines for i in range(0, total_snps1): - s += '"rs{}","1","{}","AA"\n'.format(1 + i, 101 + i) + s += f'"rs{1 + i}","1","{101 + i}","AA"\n' # add second header s += "RSID,CHROMOSOME,POSITION,RESULT\n" # generate second chunk of lines for i in range(0, total_snps2): - s += '"rs{}","1","{}","AA"\n'.format( - total_snps1 + 1 + i, total_snps1 + 101 + i - ) + s += f'"rs{total_snps1 + 1 + i}","1","{total_snps1 + 101 + i}","AA"\n' snps_df = self.create_snp_df( - rsid=["rs{}".format(1 + i) for i in range(0, total_snps1 + total_snps2)], + rsid=[f"rs{1 + i}" for i in range(0, total_snps1 + total_snps2)], chrom="1", pos=[101 + i for i in range(0, total_snps1 + total_snps2)], genotype="AA", diff --git a/tests/test_resources.py b/tests/test_resources.py index a20a5abd..7b255145 100644 --- a/tests/test_resources.py +++ b/tests/test_resources.py @@ -109,14 +109,14 @@ def f(): def _generate_test_gsa_resources(self): s = "Name\tRsID\n" for i in range(1, 618541): - s += "rs{}\trs{}\n".format(i, i) + s += f"rs{i}\trs{i}\n" mock = mock_open(read_data=gzip.compress(s.encode())) with patch("urllib.request.urlopen", mock): self.resource._get_path_gsa_rsid_map() s = 
"Name\tChr\tMapInfo\tdeCODE(cM)\n" for i in range(1, 665609): - s += "rs{}\t1\t{}\t0.0000\n".format(i, i) + s += f"rs{i}\t1\t{i}\t0.0000\n" mock = mock_open(read_data=gzip.compress(s.encode())) with patch("urllib.request.urlopen", mock): @@ -167,7 +167,7 @@ def run_reference_sequences_test(self, f, assembly="GRCh37"): if self.downloads_enabled: f() else: - s = ">MT dna:chromosome chromosome:{}:MT:1:16569:1 REF\n".format(assembly) + s = f">MT dna:chromosome chromosome:{assembly}:MT:1:16569:1 REF\n" for i in range(276): s += "A" * 60 s += "\n" @@ -194,27 +194,20 @@ def f(): self.assertEqual(len(seqs), 1) self.assertEqual( seqs["MT"].__repr__(), - "ReferenceSequence(assembly='{}', ID='MT')".format(assembly_expect), + f"ReferenceSequence(assembly='{assembly_expect}', ID='MT')", ) self.assertEqual(seqs["MT"].ID, "MT") self.assertEqual(seqs["MT"].chrom, "MT") - self.assertEqual(seqs["MT"].url, "{}".format(url_expect)) + self.assertEqual(seqs["MT"].url, f"{url_expect}") self.assertEqual( seqs["MT"].path, os.path.relpath( - "{}".format( - os.path.join( - self.resource._resources_dir, - "fasta", - assembly_expect, - os.path.basename(url_expect), - ) - ) + f'{os.path.join(self.resource._resources_dir,"fasta", assembly_expect,os.path.basename(url_expect))}' ), ) self.assertTrue(os.path.exists(seqs["MT"].path)) self.assertEqual(seqs["MT"].assembly, assembly_expect) - self.assertEqual(seqs["MT"].build, "B{}".format(assembly_expect[-2:])) + self.assertEqual(seqs["MT"].build, f"B{assembly_expect[-2:]}") self.assertEqual(seqs["MT"].species, "Homo sapiens") self.assertEqual(seqs["MT"].taxonomy, "x") @@ -286,14 +279,7 @@ def f(): self.assertEqual( seqs["MT"].path, os.path.relpath( - "{}".format( - os.path.join( - self.resource._resources_dir, - "fasta", - "GRCh37", - "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz", - ) - ) + f'{os.path.join(self.resource._resources_dir,"fasta", "GRCh37","Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz")}' ), ) 
self.assertTrue(os.path.exists(seqs["MT"].path))