-
Notifications
You must be signed in to change notification settings - Fork 0
/
computeRestrictionMap.R
83 lines (73 loc) · 3.75 KB
/
computeRestrictionMap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# =============================================================================.
# This script computes the genomic coordinates of all restriction sites for a
# given restriction enzyme. Restriction sites are defined by the short DNA motif
# (4 to 6bp in common protocols) specifically targeted by the enzyme.
# =============================================================================.
# LIBRARIES ####################################################################
message("Loading R/Bioconductor packages")
suppressPackageStartupMessages(library("MRA.TA"))
# SCRIPT PARAMETERS ############################################################
# =============================================================================.
# Command line options of the script, one row per option, with columns:
#   1: long option name
#   2: short option name
#   3: 0=no argument, 1=required argument, 2=optional argument
#   4: data type (logical, integer, double, complex, character)
#   5: a brief description of the purpose of the option
#   6: default value
script.args <- rbind(
  c('enzyme.name', 'n', 1, 'character', "name of the restriction enzyme", ""),
  c('enzyme.motif', 'm', 1, 'character', "DNA motif of the restriction enzyme", ""),
  c('genome.sequence.path', 's', 1, 'character', "path to genome sequence (fasta)", ""),
  c('genome.id', 'i', 1, 'character', "identifier of the genome assembly", ""),
  c(
    'restriction.data.path', 'r', 2, 'character',
    "path to folder for storage of restriction data, default = Processed_Data/RestrictionMap/",
    "Processed_Data/RestrictionMap/"
  )
)
# Parse the command line into a configuration list, then drop the copy of the
# option table stored in it
Cfg <- processArgs(script.args)
Cfg$script.args <- NULL
# -----------------------------------------------------------------------------.
# DEBUG VALUES (only useful for the MiMB.4C package development in RStudio)
# if(is.na(Cfg$script.name)) {
# Cfg$genome.id <- "dm6"
# Cfg$enzyme.name <- "DpnII"
# Cfg$enzyme.motif <- "GATC"
# Cfg$genome.sequence.path <- "Genome_Data/UCSC_dm6/genome.fa.gz"
# Cfg$restriction.data.path <- "Processed_Data/RestrictionMap/"
# }
# =============================================================================.
# Parameters validation: each mandatory option must hold a non-empty value,
# otherwise the script stops with a hint on how to display the help
lbl.lst <- c("genome.id", "enzyme.name", "enzyme.motif", "genome.sequence.path")
for(lbl in lbl.lst) {
  val <- Cfg[[lbl]]
  # An absent (NULL) or NA value is treated as missing, same as "". Comparing
  # such a value directly with "" would make `if` fail with an unrelated error
  # instead of reporting which option is missing.
  if(length(val) != 1 || is.na(val) || ! nzchar(val)) {
    msg <- paste("Missing", lbl, "get help using: Rscript", Cfg$script.name, "-h")
    stop(msg, "\n\n")
  }
}
# -----------------------------------------------------------------------------.
# Make the elements of Cfg (genome.id, enzyme.name, ...) accessible as plain
# variables in the statements that follow
attach(Cfg)
# -----------------------------------------------------------------------------.
# Verify that genome sequence file exists
verifyInputFiles(genome.sequence.path)
# =============================================================================.
# Parameter updates
enzyme <- list(name=enzyme.name, motif=enzyme.motif)
# Create the output folder (and any missing parent folder) from within R rather
# than through a shell `mkdir -p` command, so that paths containing spaces or
# shell metacharacters are handled correctly. An already existing folder is
# not an error.
dir.create(restriction.data.path, showWarnings=FALSE, recursive=TRUE)
if(! dir.exists(restriction.data.path)) {
  stop("Unable to create folder ", restriction.data.path, "\n\n")
}
# Common prefix of the output files: <folder>/<genome.id>_<enzyme name>
out.file.template <- paste0(restriction.data.path, "/", genome.id, "_", enzyme$name)
# PROCESSING ###################################################################
# Read the genome sequences from the fasta file
genome.seq <- readDNAStringSet(genome.sequence.path, "fasta")
# Keep only canonical sequences: discard those whose name starts with "chrUn_"
# or ends with "_random" (case insensitive)
is.unplaced <- grepl("(^chrUn_)|(_random$)", names(genome.seq), ignore.case=TRUE)
seqlist <- names(genome.seq)[! is.unplaced]
# Compute restriction sites and restriction fragments
res <- computeRestrictionMap(
  genome.seq, enzyme$motif, output.file=out.file.template, seqlist=seqlist
)
# =============================================================================.
# Compress restriction map files
# gzip Processed_Data/RestrictionMap/*.txt
# Each path is shell-quoted so that folder names or genome identifiers with
# spaces or shell metacharacters reach gzip as a single argument. Nothing is
# run when a path is absent (NULL).
# NOTE(review): res$sites and res$fragments are presumed to hold the paths of
# the files written by computeRestrictionMap - confirm against MRA.TA.
for(txt.file in c(res$sites, res$fragments)) {
  status <- system(paste("gzip", shQuote(txt.file)), intern=FALSE)
  # Report a failed compression instead of silently ignoring it; the
  # uncompressed file is left in place in that case
  if(status != 0) {
    warning("gzip failed with status ", status, " for ", txt.file)
  }
}
# =============================================================================.
detach(Cfg)