Skip to content

Commit

Permalink
Remove redundant GENCODE IDs
Browse files Browse the repository at this point in the history
GENCODE protein IDs (ENSP) are currently only unique when combined with the transcript ID (ENST). Since there are so few duplicates, we will remove the duplicated entries rather than concatenating the two IDs in the "protein_id" column of the output of the make_results_ratio_* and make_rii_peptide_* functions.
  • Loading branch information
TylerSagendorf committed Jan 10, 2023
1 parent cc927db commit 203a8ef
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 73 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,15 @@ importFrom(dplyr,inner_join)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,pull)
importFrom(dplyr,rename)
importFrom(dplyr,select)
importFrom(dplyr,starts_with)
importFrom(dplyr,summarise)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(plyr,llply)
importFrom(purrr,map)
importFrom(purrr,reduce)
importFrom(readr,read_tsv)
importFrom(tibble,rownames_to_column)
Expand Down
83 changes: 20 additions & 63 deletions R/motrpac_bic_funtions.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,6 @@ make_rii_peptide_gl <- function(msnid,
aggregation_level <- c("accession", "peptide")
annotation <- toupper(annotation)

if(annotation == "GENCODE"){
psms(msnid) <- psms(msnid) %>%
mutate(accession = sub("(ENSP[^\\|]+\\|ENST[^\\|]+).*",
"\\1", accession))
}
crosstab <- create_crosstab(msnid,
masic_data,
aggregation_level,
Expand All @@ -146,7 +141,6 @@ make_rii_peptide_gl <- function(msnid,
as.data.frame() %>%
rownames_to_column("Specie")


## Fetch conversion table
from <- annotation
to <- c("SYMBOL", "ENTREZID")
Expand All @@ -163,7 +157,7 @@ make_rii_peptide_gl <- function(msnid,
from <- "SYMBOL"
to <- "ENTREZID"
} else if (annotation == "GENCODE") {
rgx <- "(ENSP[^\\|]+\\|ENST[^\\|]+).*"
rgx <- "(ENSP[^\\|]+).*"
grp <- "\\1"
fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>%
dplyr::rename(SYMBOL = gene)
Expand All @@ -175,14 +169,6 @@ make_rii_peptide_gl <- function(msnid,
fetch_conversion_table(org_name, from = from, to = to)
)

# Add ENTREZID column to parse_FASTA_names results
if (annotation == "GENCODE") {
tab <- left_join(fasta_names, conv, by = "SYMBOL") %>%
mutate(protein_id = paste(protein_id, transcript_id, sep = "|"))
} else if (annotation == "UNIPROT") {
tab <- left_join(fasta_names, conv, by = "SYMBOL")
}

# Feature data
feature_data <- crosstab %>%
dplyr::select(Specie) %>%
Expand All @@ -196,7 +182,9 @@ make_rii_peptide_gl <- function(msnid,
left_join(conv, by = c("ANNOTATION" = annotation)) %>%
dplyr::select(-ANNOTATION)
} else if (annotation %in% c("GENCODE", "UNIPROT")) {
feature_data <- left_join(feature_data, tab, by = "protein_id")
feature_data <- left_join(feature_data, fasta_names,
by = "protein_id") %>%
left_join(conv, by = "SYMBOL")
}

feature_data <- dplyr::rename(feature_data,
Expand Down Expand Up @@ -232,8 +220,6 @@ utils::globalVariables(
)




#' @export
#' @rdname motrpac_bic_output
make_results_ratio_gl <- function(msnid,
Expand All @@ -250,10 +236,6 @@ make_results_ratio_gl <- function(msnid,
aggregation_level <- c("accession")
annotation <- toupper(annotation)

if (annotation == "GENCODE") {
msnid$accession <- sub("(ENSP[^\\|]+\\|ENST[^\\|]+).*", "\\1", msnid$accession)
}

crosstab <- create_crosstab(msnid, masic_data,
aggregation_level,
fractions, samples, references) %>%
Expand All @@ -275,7 +257,7 @@ make_results_ratio_gl <- function(msnid,
from <- "SYMBOL"
to <- "ENTREZID"
} else if (annotation == "GENCODE") {
rgx <- "(ENSP[^\\|]+\\|ENST[^\\|]+).*"
rgx <- "(ENSP[^\\|]+).*"
grp <- "\\1"
fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>%
dplyr::rename(SYMBOL = gene)
Expand All @@ -287,13 +269,6 @@ make_results_ratio_gl <- function(msnid,
fetch_conversion_table(org_name, from = from, to = to)
)

if (annotation == "GENCODE") {
tab <- left_join(fasta_names, conv, by = "SYMBOL") %>%
mutate(protein_id = paste(protein_id, transcript_id, sep = "|"))
} else if (annotation == "UNIPROT") {
tab <- left_join(fasta_names, conv, by = "SYMBOL")
}

# Create Feature data
feature_data <- crosstab %>%
dplyr::select(protein_id) %>%
Expand All @@ -304,15 +279,16 @@ make_results_ratio_gl <- function(msnid,
mutate(ANNOTATION = sub(rgx, grp, protein_id)) %>%
left_join(conv, by = c("ANNOTATION" = annotation)) %>%
select(-ANNOTATION)
} else if (annotation %in% c("UNIPROT", "GENCODE")) {
feature_data <- left_join(feature_data, tab, by = "protein_id")
} else if (annotation %in% c("GENCODE", "UNIPROT")) {
feature_data <- left_join(feature_data, fasta_names,
by = "protein_id") %>%
left_join(conv, by = "SYMBOL")
}

feature_data <- dplyr::rename(feature_data,
gene_symbol = SYMBOL,
entrez_id = ENTREZID)


## Additional info from MS/MS -------------------------------------------
ids <- psms(msnid) %>%
dplyr::select(accession, peptide,
Expand Down Expand Up @@ -340,7 +316,6 @@ utils::globalVariables(c("noninferableProteins", "percentAACoverage",
"percent_coverage", "feature", "transcript_id"))



#' @export
#' @rdname motrpac_bic_output
make_rii_peptide_ph <- function(msnid,
Expand Down Expand Up @@ -368,9 +343,6 @@ make_rii_peptide_ph <- function(msnid,
## Create Crosstab
annotation <- toupper(annotation)

if (annotation == "GENCODE") {
msnid$accession = sub("(ENSP[^\\|]+\\|ENST[^\\|]+).*", "\\1", msnid$accession)
}
aggregation_level <- c("accession", "peptide", "SiteID")
crosstab <- create_crosstab(msnid,
masic_data,
Expand All @@ -396,7 +368,7 @@ make_rii_peptide_ph <- function(msnid,
from <- "SYMBOL"
to <- "ENTREZID"
} else if (annotation == "GENCODE") {
rgx <- "(ENSP[^\\|]+\\|ENST[^\\|]+).*"
rgx <- "(ENSP[^\\|]+).*"
grp <- "\\1"
fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>%
dplyr::rename(SYMBOL = gene)
Expand All @@ -408,13 +380,6 @@ make_rii_peptide_ph <- function(msnid,
fetch_conversion_table(org_name, from = from, to = to)
)

if (annotation == "GENCODE"){
tab <- left_join(fasta_names, conv, by = "SYMBOL") %>%
mutate(protein_id = paste(protein_id, transcript_id, sep = "|"))
} else if (annotation == "UNIPROT") {
tab <- left_join(fasta_names, conv, by = "SYMBOL")
}

## Create RII peptide table
feature_data <- crosstab %>%
dplyr::select(Specie) %>%
Expand All @@ -429,7 +394,9 @@ make_rii_peptide_ph <- function(msnid,
left_join(conv, by = c("ANNOTATION" = annotation)) %>%
dplyr::select(-ANNOTATION)
} else if (annotation %in% c("GENCODE", "UNIPROT")) {
feature_data <- left_join(feature_data, tab, by = "protein_id")
feature_data <- left_join(feature_data, fasta_names,
by = "protein_id") %>%
left_join(conv, by = "SYMBOL")
}

feature_data <- dplyr::rename(feature_data,
Expand Down Expand Up @@ -474,8 +441,6 @@ utils::globalVariables(
)




#' @export
#' @rdname motrpac_bic_output
make_results_ratio_ph <- function(msnid,
Expand All @@ -487,8 +452,9 @@ make_results_ratio_ph <- function(msnid,
fasta_file)
{
aggregation_level <- c("accession", "SiteID")
crosstab <- create_crosstab(msnid, masic_data, aggregation_level, fractions,
samples, references)
crosstab <- create_crosstab(msnid, masic_data,
aggregation_level,
fractions, samples, references)
crosstab <- as.data.frame(crosstab) %>%
rownames_to_column("Specie")

Expand All @@ -509,7 +475,7 @@ make_results_ratio_ph <- function(msnid,
from <- "SYMBOL"
to <- "ENTREZID"
} else if (annotation == "GENCODE") {
rgx <- "(ENSP[^\\|]+\\|ENST[^\\|]+).*"
rgx <- "(ENSP[^\\|]+).*"
grp <- "\\1"
fasta_names <- parse_FASTA_names(fasta_file, "gencode") %>%
dplyr::rename(SYMBOL = gene)
Expand All @@ -521,13 +487,6 @@ make_results_ratio_ph <- function(msnid,
fetch_conversion_table(org_name, from = from, to = to)
)

if (annotation == "GENCODE") {
tab <- left_join(fasta_names, conv, by = "SYMBOL") %>%
mutate(protein_id = paste(protein_id, transcript_id, sep = "|"))
} else if (annotation == "UNIPROT") {
tab <- left_join(fasta_names, conv, by = "SYMBOL")
}

## Create RII peptide table
feature_data <- crosstab %>%
select(Specie) %>%
Expand All @@ -541,7 +500,9 @@ make_results_ratio_ph <- function(msnid,
left_join(conv, by = c("ANNOTATION" = annotation)) %>%
select(-ANNOTATION)
} else if (annotation %in% c("GENCODE", "UNIPROT")) {
feature_data <- left_join(feature_data, tab, by = "protein_id")
feature_data <- left_join(feature_data, fasta_names,
by = "protein_id") %>%
left_join(conv, by = "SYMBOL")
}

feature_data <- dplyr::rename(feature_data,
Expand Down Expand Up @@ -585,8 +546,6 @@ utils::globalVariables(
)




#' @export
#' @rdname motrpac_bic_output
assess_redundant_protein_matches <- function(msnid, collapse="|") {
Expand All @@ -602,8 +561,6 @@ assess_redundant_protein_matches <- function(msnid, collapse="|") {
}




#' @export
#' @rdname motrpac_bic_output
assess_noninferable_proteins <- function(msnid, collapse="|") {
Expand Down
29 changes: 19 additions & 10 deletions R/run_plexedpiper.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@
#' @importFrom MSnID psms MSnID compute_accession_coverage
#' correct_peak_selection extract_sequence_window
#' infer_parsimonious_accessions map_mod_sites
#' @importFrom dplyr %>% full_join select
#' @importFrom dplyr %>% full_join select mutate filter pull
#' @importFrom tidyselect where
#' @importFrom data.table rbindlist
#' @importFrom purrr reduce
#' @importFrom purrr reduce map
#'
#' @examples \dontrun{
#' # Example with pseudo-paths
Expand Down Expand Up @@ -126,22 +126,20 @@ run_plexedpiper <- function(msgf_output_folder,

suppressMessages(msnid <- MSnID())

psms(msnid) <- lapply(msgf_output_folder, function(msgf_folder_i) {
psms(msnid) <- map(msgf_output_folder, function(msgf_folder_i) {
out <- read_msgf_data(msgf_folder_i)
out <- psms(out)
return(out)
}) %>%
rbindlist(fill = TRUE)

if (!is.null(ascore_output_folder)) {
ascore <- lapply(ascore_output_folder, function(ascore_folder_i) {
read_AScore_results(ascore_folder_i)
}) %>%
ascore <- map(ascore_output_folder, read_AScore_results) %>%
rbindlist(fill = TRUE)
}

if (verbose) {message("- Filtering MASIC results.")}
masic_data <- lapply(masic_output_folder, function(masic_folder) {
masic_data <- map(masic_output_folder, function(masic_folder) {
read_masic_data(masic_folder, interference_score = TRUE) %>%
filter_masic_data(0.5, 0)
})
Expand Down Expand Up @@ -184,8 +182,19 @@ run_plexedpiper <- function(msgf_output_folder,
msnid <- apply_filter(msnid, "!isDecoy")

if (annotation == "GENCODE") {
msnid$accession <- sub("(ENSP[^\\|]+\\|ENST[^\\|]+).*", "\\1", msnid$accession)
names(fst) <- sub("(ENSP[^\\|]+\\|ENST[^\\|]+).*", "\\1", names(fst))
# Remove duplicate GENCODE protein IDs using FASTA file headers
pttrn <- "(ENSP[^\\|]+).*"
unique_ids <- data.frame(id_orig = names(fst)) %>%
mutate(id_new = sub(pttrn, "\\1", id_orig),
duped = duplicated(id_new)) %>%
filter(!duped) %>%
pull(id_orig)
msnid <- apply_filter(msnid, "accession %in% unique_ids")
msnid$accession <- sub(pttrn, "\\1", msnid$accession)
fst <- fst[names(fst) %in% unique_ids]
names(fst) <- sub(pttrn, "\\1", names(fst))

# Sanity check
if (anyDuplicated(names(fst)) != 0) {
stop("Duplicate FASTA entry names!")
}
Expand Down Expand Up @@ -228,7 +237,7 @@ run_plexedpiper <- function(msgf_output_folder,
stop("Proteomics variable not supported.")
}

msnid <- map_mod_sites(msnid,
msnid <- map_mod_sites(object = msnid,
fasta = fst,
accession_col = "accession",
peptide_mod_col = "peptide",
Expand Down

0 comments on commit 203a8ef

Please sign in to comment.