JRaviLab · Cateline · Oct 11, 2024 · Oct 11, 2024 · Oct 13, 2024 · Oct 13, 2024
diff --git a/CARD_data/CARD-Download-README.txt b/CARD_data/CARD-Download-README.txt
@@ -0,0 +1,70 @@
+CARD Download README
+
+Use or reproduction of these materials, in whole or in part, by any commercial 
+organization whether or not for non-commercial (including research) or commercial purposes
+is prohibited, except with written permission of McMaster University. Commercial uses are
+offered only pursuant to a written license and user fee. To obtain permission and begin 
+the licensing process, see http://card.mcmaster.ca/about.
+
+CITATION:
+
+Alcock et al. 2023. "CARD 2023: expanded curation, support for machine learning, and resistome 
+prediction at the Comprehensive Antibiotic Resistance Database" Nucleic Acids Research, 
+51, D690-D699. https://pubmed.ncbi.nlm.nih.gov/36263822/
+
+CARD SHORT NAMES:
+
+A CARD-specific abbreviation for AMR gene names associated with Antibiotic Resistance
+Ontology terms, often not based on the literature. This is used for programmatic and 
+compatibility purposes and is not ontologically relevant. Each ontology term with an 
+associated AMR detection model has a CARD Short Name that appears in CARD data files 
+and output generated by RGI. If the original gene name is less than 15 characters, the 
+CARD short name is identical; if the gene name is greater than 15 characters, the CARD 
+Short Name has been abbreviated by CARD curators specifically to identify the proper 
+gene or protein name. All CARD Short Names are unique and have whitespace characters 
+replaced by underscore characters. The convention for pathogen names is capitalized 
+first letter of the genus followed by the lowercase first three letters of the species 
+name. The antibiotic abbreviations are from https://journals.asm.org/journal/aac/abbreviations
+plus some custom abbreviations by the CARD curators. Simple CARD Short Names often do not
+involve either, e.g. CTX-M-15, but where applicable the CARD Short Names follow pathogen_gene
+or pathogen_gene_drug. The full lists of abbreviations can be found in the enclosed files: 
+
+"shortname_antibiotics.tsv"
+"shortname_pathogens.tsv"
+
+FASTA:
+
+Nucleotide and corresponding protein FASTA downloads are available as separate files for 
+each model type.  For example, the "protein homolog" model type contains sequences of
+antimicrobial resistance genes that do not include mutation as a determinant of resistance
+- these data are appropriate for BLAST analysis of metagenomic data or searches excluding 
+secondary screening for resistance mutations. In contrast, the "protein variant" model 
+includes reference wild type sequences used for mapping SNPs conferring antimicrobial 
+resistance - without secondary mutation screening, analyses using these data will include 
+false positives for antibiotic resistant gene variants or mutants.
+
+MODELS:
+
+The file "card.json" contains the complete data for all of CARD's AMR detection models, 
+including reference sequences, SNP mapping data, model parameters, and ARO classification.
+"card.json" is used by the Resistance Gene Identifier software. 
+
+Values for "High Confidence TB", "Moderate Confidence TB", "Minimal Confidence TB", and
+"Indeterminate Confidence TB" were obtained from https://platform.reseqtb.org.
+
+INDEX FILES:
+
+The file "aro_index.tsv" contains a list of ARO tagging of GenBank accessions stored in 
+CARD.
+
+The file "aro_categories.tsv" contains a list of ARO terms used to categorize all entries
+in CARD and results via the RGI. These categories reflect AMR gene family, target drug 
+class, and mechanism of resistance.
+
+The file "aro_categories_index.tsv" contains a list of GenBank accessions stored 
+in CARD cross-referenced with the major categories within the ARO. These categories 
+reflect AMR gene family, target drug class, and mechanism of resistance, so GenBank 
+accessions may have more than one cross-reference. For more complex categorization of 
+the data, use the full ARO available at http://card.mcmaster.ca/download.
+
+The file "snps.txt" lists the SNPs associated with specific detection models.
diff --git a/CARD_data/aro_categories_index.tsv b/CARD_data/aro_categories_index.tsv
diff --git a/CARD_data/aro_index.tsv b/CARD_data/aro_index.tsv
diff --git a/CARD_data/shortname_antibiotics.tsv b/CARD_data/shortname_antibiotics.tsv
@@ -0,0 +1,76 @@
+AAC Abbreviation	Molecule
+AMG	Aminoglycosides
+AMK	Amikacin
+AMU	Aminocoumarin
+AMX	Amoxicillin
+ATM	Aztreonam
+AVI	Avibactam
+AZM	Azithromycin
+BDQ	Bedaquiline
+BLA	Beta-lactams
+CAP	Capreomycin
+CEF	Ceftazidime
+CZA	Ceftazidime-Avibactam
+CHL	Chloramphenicol
+CIP	Ciprofloxacin
+CLI	Clindamycin
+CLR	Clarithromycin
+CST	Colistin
+DAO	Dapsone
+DAP	Daptomycin
+DCS	D-cycloserine
+EDN	Edeine
+ELF	Elfamycin
+EMB	Ethambutol
+EMCM	Ethambutol & Capreomycin
+ENC	Enacyloxin IIa
+ENR	Enrofloxacin
+ERY	Erythromycin
+ETO	Ethionamide
+FA	Fusidic acid
+FLO	Fluoroquinolones
+FOF	Fosfomycin
+G418	G418
+GE2A	GE2270A
+GEN	Gentamicin
+GENC	Gentamicin C
+HGM	Hygromycin B
+INH	Isoniazid
+IPM	Imipenem
+KAN	Kanamycin
+KAS	Kasugamycin
+KIR	Kirromycin
+LEV	Levofloxacin
+LYS	Lysocin (E)
+LZD	Linezolid
+MAC	Macrolides
+MULT	Multiple antibiotics
+MUP	Mupirocin
+MTZ	Metronidazole
+MXF	Moxifloxacin
+NEO	Neomycin
+NIT	Nitrofurantoin
+OFX	Ofloxacin
+OXZ	Oxazolidinone
+PAC	Pactamycin
+PAR	Paromomycin
+PAS	Para-aminosalicylic acid
+PCL	Perchlozone
+PLM	Pleuromutilin
+PLV	Pulvomycin
+PTO	Prothionamide
+PZA	Pyrazinamide
+RFB	Rifabutin
+RIF	Rifampicin
+SLF	Sulfonamides
+SPT	Spectinomycin
+STR	Streptomycin
+TMP	Trimethoprim
+TET	Tetracycline
+TOB	Tobramycin
+TRC	Triclosan
+TYL	Tylosin
+VAN	Vancomycin
+VIO	Viomycin
+ZOL	Zoliflodacin
+CAP	capreomycin
diff --git a/CARD_data/shortname_pathogens.tsv b/CARD_data/shortname_pathogens.tsv
@@ -0,0 +1,94 @@
+Abbreviation	Pathogen
+Abau	Acinetobacter baumannii
+Acla	Alkalihalobacillus clausii
+Afab	Agrobacterium fabrum
+Bado	Bifidobacterium adolescentis
+Bbac	Bartonella bacilliformis
+Bbif	Bifidobacterium bifidum
+Bbur	Borreliella burgdorferi
+Bdol	Burkholderia dolosa
+Bhyo	Brachyspira hyodysenteriae
+Bpse	Burkholderia pseudomallei
+Bpum	Bacillus pumilus
+Bsub	Bacillus subtilis
+Bsui	Brucella suis
+Ccol	Campylobacter coli
+Cacn	Cutibacterium acnes
+Cbut	Clostridium butyricum
+Cspo	Clostridium sporogenes
+Cdif	Clostridioides difficile
+Cgin	Capnocytophaga gingivalis
+Cjej	Campylobacter jejuni
+Cmen	Chryseobacterium meningosepticum
+Cper	Clostridium perfringens
+Cpsi	Chlamydophila psittaci
+Crei	Chlamydomonas reinhardtii
+Cstr	Corynebacterium striatum
+Ctra	Chlamydia trachomatis
+Eclo	Enterobacter cloacae
+Ecol	Escherichia coli
+Efac	Enterococcus faecium
+Efae	Enterococcus faecalis
+Erhu	Erysipelothrix rhusiopathiae
+Hhal	Halobacterium halobium
+Hinf	Haemophilus influenzae
+Hpin	Haemophilus parainfluenzae
+Hpyl	Helicobacter pylori
+Hsal	Halobacterium salinarum
+Kaer	Klebsiella aerogenes
+Kleb	Klebsiella
+Kpne	Klebsiella pneumoniae
+Lhon	Laribacter hongkongensis
+Lmon	Listeria monocytogenes
+Lreu	Limosilactobacillus reuteri
+Mabs	Mycobacteroides abscessus
+Mavi	Mycobacterium avium
+Mbov	Mycobacterium tuberculosis variant bovis
+Mcat	Moraxella catarrhalis
+Mche	Mycobacteroides chelonae
+Mfer	Mycoplasmopsis fermentans
+Mgal	Mycoplasma gallisepticum
+Mgen	Mycoplasma genitalium
+Mhom	Mycoplasma hominis
+Mint	Mycobacterium intracellulare
+Mkan	Mycobacterium kansasii
+Mlep	Mycobacterium leprae
+Mmor	Morganella morganii
+Mpne	Mycoplasma pneumoniae
+Msme	Mycolicibacterium smegmatis
+Mtub	Mycobacterium tuberculosis
+Ngon	Neisseria gonorrhoeae
+Nmen	Neisseria meningitidis
+Nvir	Neobacillus vireti
+Nfar	Nocardia farcinica
+Paer	Pseudomonas aeruginosa
+Pmir	Proteus mirabilis
+Pmul	Pasteurella multocida
+Prop	Propionibacteria
+Pros	Planobispora rosea
+Rfas	Rhodococcus fascians
+Rsph	Rhodobacter sphaeroides
+Saga	Streptococcus agalactiae
+Samb	Streptomyces ambofaciens
+Saur	Staphylococcus aureus
+Scin	Streptomyces cinnamoneus
+Scoh	Staphylococcus cohnii
+Sent	Salmonella enterica
+Sfle	Shigella flexneri
+Sfra	Streptomyces fradiae
+Sven	Streptomyces venezuelae
+Sint	Staphylococcus intermedius
+Sliv	Streptomyces lividans
+Smar	Serratia marcescens
+Smit	Streptococcus mitis
+Spne	Streptococcus pneumoniae
+Spyo	Streptococcus pyogenes
+Sris	Streptomyces rishiriensis
+Sser	Salmonella serovars
+Ssui	Streptococcus suis
+Tthe	Thermus thermophilus
+Uure	Ureaplasma urealyticum
+Vcho	Vibrio cholerae
+Vang	Vibrio anguillarum
+Vvul	Vibrio vulnificus
+Yent	Yersinia enterocolitica
diff --git a/Klebsiella CARD Filtering Code.R b/Klebsiella CARD Filtering Code.R
@@ -0,0 +1,103 @@
+# Klebsiella CARD filtering script.
+#
+# Downloads the CARD "broadstreet" v3.3.0 release, unpacks it into CARD_data/,
+# splits each CARD Short Name into pathogen / gene / drug tokens, resolves the
+# pathogen and drug abbreviations with the lookup tables from the release, and
+# extracts the Klebsiella pneumoniae x CZA combinations.
+
+library(dplyr)
+
+# Download and unpack ----
+url <- "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2"
+destfile <- "broadstreet-v3.3.0.tar.bz2"
+card_dir <- "CARD_data"
+# Skip the download when the archive is already present; mode = "wb" keeps the
+# binary archive intact on Windows.
+if (!file.exists(destfile)) {
+  download.file(url, destfile, mode = "wb")
+}
+
+# untar() reads the bzip2-compressed tarball directly, so neither R.utils nor
+# an intermediate .tar file is needed and the script can be re-run safely.
+untar(destfile, exdir = card_dir)
+print(list.files(card_dir))
+
+# Read the index and lookup tables ----
+# NOTE: the index file is "aro_index.tsv" (lower case); "ARO_index.tsv" does
+# not resolve on case-sensitive file systems.
+read_card_tsv <- function(file_name) {
+  read.delim(
+    file.path(card_dir, file_name),
+    header = TRUE,
+    sep = "\t",
+    stringsAsFactors = FALSE
+  )
+}
+
+aro_index <- read_card_tsv("aro_index.tsv")
+antibiotics_data <- read_card_tsv("shortname_antibiotics.tsv")
+pathogens_data <- read_card_tsv("shortname_pathogens.tsv")
+
+# Split CARD.Short.Name into pathogen, gene, and drug ----
+# Token 1 is taken as the pathogen, token 2 as the gene, and token 3 as the
+# drug only when the name has exactly three "_"-separated tokens. Names without
+# an underscore (e.g. "CTX-M-15") therefore get gene = NA and drug = NA.
+name_parts <- strsplit(aro_index[["CARD.Short.Name"]], "_", fixed = TRUE)
+nth_part <- function(parts, i) {
+  vapply(parts, function(p) p[i], character(1))
+}
+aro_index$pathogen <- nth_part(name_parts, 1L)
+aro_index$gene <- nth_part(name_parts, 2L)
+aro_index$drug <- nth_part(name_parts, 3L)
+aro_index$drug[lengths(name_parts) != 3L] <- NA_character_
+aro_index_clean <- aro_index
+
+# Inspect the loaded tables and their column names
+print(head(aro_index))
+print(head(antibiotics_data))
+print(head(pathogens_data))
+print(colnames(aro_index))
+print(colnames(antibiotics_data))
+print(colnames(pathogens_data))
+
+# Resolve abbreviations ----
+# The lookup tables can list an abbreviation more than once (e.g. "CAP");
+# keep one row per key so the left joins cannot multiply aro_index rows.
+antibiotics_lookup <- antibiotics_data %>%
+  distinct(AAC.Abbreviation, .keep_all = TRUE)
+pathogens_lookup <- pathogens_data %>%
+  distinct(Abbreviation, .keep_all = TRUE)
+
+# Antibiotic abbreviations describe the drug token, so join on `drug`.
+merged_data_antibiotics <- left_join(
+  aro_index_clean, antibiotics_lookup,
+  by = c("drug" = "AAC.Abbreviation")
+)
+merged_data_pathogens <- left_join(
+  merged_data_antibiotics, pathogens_lookup,
+  by = c("pathogen" = "Abbreviation")
+)
+
+# Remove duplicate rows and rows without a pathogen token
+cleaned_data <- merged_data_pathogens %>%
+  distinct() %>%
+  filter(!is.na(pathogen))
+
+# Summarise and filter ----
+# One row per Pathogen / Gene / Drug. Antibiotic_Info is NA (rather than the
+# string "NA") when the drug token has no match in the antibiotic table.
+summarized_data <- cleaned_data %>%
+  group_by(Pathogen, Gene = gene, Drug = drug) %>%
+  summarize(
+    Antibiotic_Info = paste(unique(Molecule[!is.na(Molecule)]), collapse = ", "),
+    .groups = "drop"
+  ) %>%
+  mutate(Antibiotic_Info = na_if(Antibiotic_Info, "")) %>%
+  arrange(Pathogen, Gene, Drug)
+print(head(summarized_data))
+
+# Klebsiella pneumoniae and CZA (bug-drug combination of interest)
+klebsiella_cza_combinations <- summarized_data %>%
+  filter(Pathogen == "Klebsiella pneumoniae", Drug == "CZA")
+print(head(klebsiella_cza_combinations))
+
diff --git a/MolEvolvR.Rproj b/MolEvolvR.Rproj
@@ -20,3 +20,5 @@ BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
 PackageRoxygenize: rd,collate,namespace
+
+ProjectName: Process CARD Data