-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathCreatePdbToUniProtMappingFile.java
142 lines (115 loc) · 5.62 KB
/
CreatePdbToUniProtMappingFile.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package edu.sdsc.mmtf.spark.applications;

import java.io.File;
import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import edu.sdsc.mmtf.spark.datasets.PdbToUniProt;
/**
* Builds or updates a dataset of PDB to UniProt residue number mappings from
* the SIFTS project. Building a new dataset is very slow and may take more
* than one day. Preferably, use the update option (-u) to update the cached dataset.
*
* For more information about SIFTS see:
* <p>
* The "Structure Integration with Function, Taxonomy and Sequence"
* (<a href="https://www.ebi.ac.uk/pdbe/docs/sifts/overview.html">SIFTS</a>) is
* the authoritative source of up-to-date residue-level mapping to UniProt.
*
* @author Peter Rose
* @since 0.2.0
*
*/
public class CreatePdbToUniProtMappingFile {

    /**
     * Entry point. Builds or updates the PDB to UniProt mapping dataset and
     * coalesces the Spark output directory into a single compressed file.
     *
     * @param args command line arguments (see {@link #getCommandLine(String[])})
     * @throws IOException          if the temporary dataset directory cannot be deleted
     * @throws InterruptedException if dataset creation is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // process command line options (defaults are provided)
        CommandLine cmd = getCommandLine(args);
        String outputFile = cmd.getOptionValue("output-file");
        boolean build = cmd.hasOption("build");
        boolean update = cmd.hasOption("update");
        // these default options for fileFormat and compressionCodec
        // provide the best compression
        String fileFormat = cmd.getOptionValue("file-format", "orc");
        String compressionCodec = cmd.getOptionValue("compression-codec", "lzo");

        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName(CreatePdbToUniProtMappingFile.class.getSimpleName())
                .getOrCreate();

        // java.time replaces the legacy, non-thread-safe SimpleDateFormat/Date pair
        String timeStamp = LocalDateTime.now()
                .format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"));

        long t1 = System.nanoTime();

        // intermediate Spark output directory and final single-file name
        String dirName = outputFile + "_" + timeStamp + "_tmp";
        String fileName = outputFile + "_" + timeStamp + "." + fileFormat + "." + compressionCodec;

        if (build) {
            // create a new mapping file from scratch (slow; may take more than a day)
            PdbToUniProt.buildDataset(dirName, "orc", "lzo");
        } else if (update) {
            // create an updated mapping file from the cached version
            PdbToUniProt.updateDataset(dirName, "orc", "lzo");
        }

        long t2 = System.nanoTime();
        System.out.println("Time to build/update dataset: " + (t2 - t1) / 1E9 + " sec.");

        // By default, Spark creates a directory of files.
        // For convenience, coalesce the data into a single file.
        Dataset<Row> ds = spark.read().orc(dirName);
        long count = ds.count();
        int partitions = 1;
        DatasetFileConverter.saveDataset(ds, partitions, fileFormat, compressionCodec, fileName);
        FileUtils.deleteDirectory(new File(dirName));

        System.out.println(count + " records saved to: " + fileName);

        long t3 = System.nanoTime();
        System.out.println("Time to reformat data: " + (t3 - t2) / 1E9 + " sec.");

        spark.stop();
    }

    /**
     * Parses and validates the command line options.
     * <p>
     * On invalid input this method prints a usage message and terminates the
     * JVM (exit code -1 for errors, 1 for -h/--help or a missing output file,
     * matching the original behavior).
     *
     * @param args raw command line arguments
     * @return the parsed, validated command line
     */
    private static CommandLine getCommandLine(String[] args) {
        Options options = new Options();
        options.addOption("h", "help", false, "help");
        options.addOption("o", "output-file", true, "path to output file");
        options.addOption("b", "build", false, "build a new dataset (slow!)");
        options.addOption("u", "update", false, "update cached dataset");
        options.addOption("f", "file-format", true, "parquet, orc");
        options.addOption("c", "compression-codec", true, "gzip or snappy for parquet, zlib or lzo for orc");

        // DefaultParser replaces the deprecated BasicParser
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = null;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.out.println("ERROR: invalid command line arguments: " + e.getMessage());
            printHelpAndExit(options, -1);
        }

        if (cmd.hasOption("help")) {
            printHelpAndExit(options, 1);
        }
        // exactly one of -b / -u must be given: reject "neither" and "both" alike
        if (cmd.hasOption('b') == cmd.hasOption('u')) {
            System.out.println("ERROR: use either -u or -b option");
            printHelpAndExit(options, -1);
        }
        if (!cmd.hasOption("output-file")) {
            System.err.println("ERROR: no output file specified!");
            printHelpAndExit(options, 1);
        }
        return cmd;
    }

    /** Prints the usage message and terminates the JVM with the given status. */
    private static void printHelpAndExit(Options options, int status) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(CreatePdbToUniProtMappingFile.class.getSimpleName(), options);
        System.exit(status);
    }
}