-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDECIPHER_DetectRepeats.7.r
79 lines (73 loc) · 2.88 KB
/
DECIPHER_DetectRepeats.7.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Usage:
#   Rscript DECIPHER_DetectRepeats.7.r <in_dataframe> <row_index>
# Example:
#   Rscript DECIPHER_DetectRepeats.7.r euk_1775_fail.rds 1
#
#   <in_dataframe>  path to the .rds file that is read with readRDS() below
#   <row_index>     1-based row of that table to process
ARGS <- commandArgs(trailingOnly = TRUE)
print(ARGS) # echo the raw arguments first so they are logged even on failure

# Fail early with a readable message instead of letting NA values propagate
# into readRDS() / row lookups further down.
if (length(ARGS) < 2) {
  stop(
    "Expected 2 arguments: <in_dataframe> <row_index>; got ", length(ARGS),
    call. = FALSE
  )
}
in_dataframe <- ARGS[1]
# suppressWarnings: a non-numeric value is reported by the check below rather
# than by the generic "NAs introduced by coercion" warning.
row_index <- suppressWarnings(as.integer(ARGS[2]))
if (is.na(row_index) || row_index < 1L) {
  stop(
    "row_index must be a positive integer, got: '", ARGS[2], "'",
    call. = FALSE
  )
}
print(in_dataframe)
print(row_index)

# load libraries
suppressMessages(library(DECIPHER))
# The next two calls are top-level expressions; under Rscript their values
# (core count and DECIPHER version) are printed to the log.
detectCores()
packageVersion("DECIPHER")
# default is 60 sec, will fail if the genome is too big
options(timeout = 999999999)
######################## main ########################
# Workflow:
#   1. read the genome table and pick the row given by row_index
#   2. build the URL of the protein FASTA from the RefSeq (or GenBank) address
#   3. download it, retrying a limited number of times
#   4. run DetectRepeats on the sequences in fixed-size chunks
#   5. save the combined result as <genomeID>.rds

fasta_file <- 'protein.faa.gz' # local name of the downloaded protein FASTA
max_attempts <- 5              # download tries before giving up
chunk_size <- 100              # sequences passed to DetectRepeats per call
n_processors <- 8              # value passed to DetectRepeats(processors=)

# TRUE when x is not a usable single address (missing, NA or empty string).
is_blank <- function(x) {
  length(x) != 1 || is.na(x) || !nzchar(x)
}

# read data frame that contains FTP address and genome names
genomes_df <- readRDS(in_dataframe)
genomeIDs <- rownames(genomes_df)
if (is.na(row_index) || row_index < 1 || row_index > length(genomeIDs)) {
  stop(
    "row_index must be between 1 and ", length(genomeIDs),
    ", got: ", row_index,
    call. = FALSE
  )
}
genomeID_h <- genomeIDs[row_index]
out_filename <- paste0(genomeID_h, '.rds')
cat('\n', row_index, genomeID_h, '\n')

# Prefer the RefSeq address and fall back to GenBank when it is blank.
# as.character() keeps the later string handling working if the column was
# stored as a factor.
fas.ftp <- as.character(genomes_df[row_index, 'RefSeq.FTP'])
if (is_blank(fas.ftp)) {
  fas.ftp <- as.character(genomes_df[row_index, 'GenBank.FTP'])
}

if (is_blank(fas.ftp)) {
  cat('\n', row_index, genomeID_h, 'failed to get the FTP address.\n')
} else {
  # The FASTA file is named after the last path component of the address.
  # basename() yields that component regardless of path depth; a trailing
  # slash is dropped first so the URL does not end up with "//".
  fas.ftp <- sub("/+$", "", fas.ftp)
  fas.url <- paste0(fas.ftp, "/", basename(fas.ftp), "_protein.faa.gz")
  cat("\nDownloading AA fasta file from: ", fas.url, '\n')

  # NOTE: as before, a file already present under this name is reused and no
  # download is attempted; make sure it belongs to the genome being processed.
  if (file.exists(fasta_file)) {
    cat('\nFound existing', fasta_file, '- skipping download.\n')
  }

  # try downloading until it succeeds (at most max_attempts attempts)
  attempt <- 1
  while (!file.exists(fasta_file) && attempt <= max_attempts) {
    status <- tryCatch(
      download.file(fas.url, fasta_file, mode = "wb"),
      error = function(e) {
        cat('Download attempt', attempt, 'failed:', conditionMessage(e), '\n')
        1L # any non-zero value marks the attempt as failed
      }
    )
    # A failed attempt may leave a partial file behind; remove it so the loop
    # retries instead of treating the fragment as a finished download.
    if (!identical(as.integer(status), 0L) && file.exists(fasta_file)) {
      file.remove(fasta_file)
    }
    attempt <- attempt + 1
  }

  if (!file.exists(fasta_file)) {
    cat('\n', row_index, genomeID_h, 'failed to download seq from FTP.\n')
  } else {
    cat('\nDONE downloading', row_index, genomeID_h, 'from FTP.\n')

    # Process the file chunk by chunk; start_i is the 1-based position (within
    # the whole file) of the first sequence of the current chunk.
    results <- list()
    start_i <- 1
    repeat {
      # read in at most chunk_size seqs, starting from start_i
      seq_h <- readAAStringSet(fasta_file, nrec = chunk_size, skip = start_i - 1)
      n_read <- length(seq_h)
      if (n_read == 0) {
        break # nothing left to read
      }
      seq_h <- RemoveGaps(seq_h, "all")
      cat('\nRunning DetectRepeats on CDS No.', start_i, 'to No.', start_i + n_read - 1, '...\n\n')
      result_h <- DetectRepeats(seq_h, processors = n_processors, verbose = TRUE, type = "tandem")
      # Index is relative to the chunk; shift it to the position in the file.
      # The first chunk needs no shift and is left untouched.
      if (start_i > 1) {
        result_h$Index <- result_h$Index + start_i - 1
      }
      results[[length(results) + 1]] <- result_h
      start_i <- start_i + n_read
      if (n_read < chunk_size) {
        break # a short chunk means the end of the file was reached
      }
    }

    n_seqs <- start_i - 1 # total number of sequences processed
    if (n_seqs == 0) {
      cat('\n', row_index, genomeID_h, 'protein FASTA contains no sequences; nothing to save.\n')
    } else {
      # combine the per-chunk tables once, after the loop
      result_all <- do.call(rbind, results)
      cat('\nDONE running DetectRepeats on', n_seqs, 'CDS,', nrow(result_all), 'repeats found...\n\n')
      # write output
      saveRDS(result_all, file = out_filename, compress = TRUE)
      cat('\nSave DetectRepeats result as', out_filename, '\n')
    }
  }
}