-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseqhasher.go
325 lines (279 loc) · 11.1 KB
/
seqhasher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// This file is part of SeqHasher program (by Vladimir Mikryukov)
// and is licensed under GNU GPL-3.0-or-later.
// See the LICENSE file in the root of the source tree
// or <http://www.gnu.org/licenses/gpl-3.0.html>.
package main
import (
"bufio"
"bytes"
"crypto/md5"
"crypto/sha1"
"encoding/hex"
"flag"
"fmt"
"io"
"log"
"os"
"strings"
"github.com/shenwei356/bio/seq"
"github.com/shenwei356/bio/seqio/fastx"
"github.com/cespare/xxhash/v2"
"github.com/go-faster/city"
"github.com/spaolacci/murmur3"
"github.com/zeebo/blake3"
"golang.org/x/crypto/sha3"
"github.com/fatih/color"
"github.com/will-rowe/nthash"
)
// Program-wide constants.
const (
version = "1.1.1" // Program version, printed by the -version/-v flag and in the usage text
defaultHashType = "sha1" // Hash type used when the -hash/-H flag is not given
)
// supportedHashTypes lists every hash type name accepted by the -hash/-H flag.
// isValidHashType checks candidate names against this list, and the usage and
// error messages print it.
var supportedHashTypes = []string{"sha1", "sha3", "md5", "xxhash", "cityhash", "murmur3", "nthash", "blake3"}
// config holds the command-line settings collected by parseFlags.
type config struct {
// headersOnly is set by -headersonly/-o: write only the rewritten headers,
// one per line, instead of full records.
headersOnly bool
// hashTypes is the comma-separated value of -hash/-H split into hash type names.
hashTypes []string
// noFileName is set by -nofilename/-n: leave the file name out of the header.
noFileName bool
// caseSensitive is set by -casesensitive/-c: hash the sequence as-is instead
// of converting it to uppercase first.
caseSensitive bool
// inputFileName is the first positional argument ("-" means stdin).
inputFileName string
// outputFileName is the second positional argument ("" or "-" means stdout).
outputFileName string
// nameOverride is set by -name/-f: text used in the header in place of the
// input file name.
nameOverride string
// showVersion is set by -version/-v: print the version and exit.
showVersion bool
}
// main runs the program against standard output and terminates with a logged
// fatal error if run reports a failure.
func main() {
	err := run(os.Stdout)
	if err == nil {
		return
	}
	log.Fatal(err)
}
// run is the program entry point behind main. It parses the command-line
// flags, then either prints the version, prints the usage text (when no input
// file was given), or processes the input sequences.
//
// w receives the version/usage text and, when no output file is named (or it
// is "-"), the processed records as well.
//
// It returns any error from flag parsing, from opening the input or output,
// from processing, or from closing the output file.
func run(w io.Writer) (retErr error) {
	// Disable sequence validation
	seq.ValidateSeq = false

	cfg, err := parseFlags()
	if err != nil {
		return err
	}

	if cfg.showVersion {
		fmt.Fprintf(w, "SeqHasher %s\n", version)
		return nil
	}

	if cfg.inputFileName == "" {
		printUsage(w)
		return nil
	}

	input, err := getInput(cfg.inputFileName)
	if err != nil {
		return fmt.Errorf("Error opening input: %w", err)
	}
	defer input.Close()

	output := w
	if cfg.outputFileName != "" && cfg.outputFileName != "-" {
		outputFile, err := getOutput(cfg.outputFileName)
		if err != nil {
			return fmt.Errorf("Error opening output: %w", err)
		}
		// A failed Close on a file we wrote to can mean lost data, so report
		// it unless an earlier error is already being returned.
		defer func() {
			if cerr := outputFile.Close(); cerr != nil && retErr == nil {
				retErr = fmt.Errorf("Error closing output: %w", cerr)
			}
		}()
		output = outputFile
	}

	return processSequences(input, output, cfg)
}
// parseFlags registers the program's flags on the default flag set, parses
// the command line, and returns the resulting configuration.
//
// The first and second positional arguments become the input and output file
// names. The -hash/-H value is split on commas; each name is trimmed of
// surrounding whitespace, validated against the supported hash types, and
// stored in its trimmed form so that later lookups by name match exactly.
//
// It returns an error if any hash type name is not supported.
func parseFlags() (config, error) {
	cfg := config{}

	flag.BoolVar(&cfg.headersOnly, "headersonly", false, "Output only headers")
	flag.BoolVar(&cfg.headersOnly, "o", false, "Output only headers (shorthand)")

	var hashTypesString string
	flag.StringVar(&hashTypesString, "hash", defaultHashType, "Hash type(s) (comma-separated: sha1, sha3, md5, xxhash, cityhash, murmur3, nthash, blake3)")
	flag.StringVar(&hashTypesString, "H", defaultHashType, "Hash type(s) (shorthand)")

	flag.BoolVar(&cfg.noFileName, "nofilename", false, "Do not include file name in output")
	flag.BoolVar(&cfg.noFileName, "n", false, "Do not include file name in output (shorthand)")
	flag.BoolVar(&cfg.caseSensitive, "casesensitive", false, "Case-sensitive hashing")
	flag.BoolVar(&cfg.caseSensitive, "c", false, "Case-sensitive hashing (shorthand)")
	flag.StringVar(&cfg.nameOverride, "name", "", "Override input file name in output")
	flag.StringVar(&cfg.nameOverride, "f", "", "Override input file name in output (shorthand)")
	flag.BoolVar(&cfg.showVersion, "version", false, "Show version information")
	flag.BoolVar(&cfg.showVersion, "v", false, "Show version information (shorthand)")

	flag.Usage = func() {
		printUsage(os.Stderr)
	}

	flag.Parse()

	cfg.inputFileName = flag.Arg(0)
	cfg.outputFileName = flag.Arg(1)

	// Parse hash types. The trimmed name is what gets stored: keeping the raw
	// piece (e.g. " md5" from "sha1, md5") would pass validation here but fail
	// to match any algorithm name later on.
	rawTypes := strings.Split(hashTypesString, ",")
	cfg.hashTypes = make([]string, 0, len(rawTypes))
	for _, ht := range rawTypes {
		name := strings.TrimSpace(ht)
		if !isValidHashType(name) {
			return config{}, fmt.Errorf("Invalid hash type: %s. Supported types are: %s", ht, strings.Join(supportedHashTypes, ", "))
		}
		cfg.hashTypes = append(cfg.hashTypes, name)
	}

	return cfg, nil
}
// isValidHashType reports whether hashType exactly matches one of the entries
// in supportedHashTypes. The comparison is case-sensitive and does no trimming.
func isValidHashType(hashType string) bool {
	found := false
	for i := 0; i < len(supportedHashTypes) && !found; i++ {
		found = supportedHashTypes[i] == hashType
	}
	return found
}
// getInput returns the reader to take sequences from: os.Stdin when fileName
// is "" or "-", otherwise the named file opened for reading.
//
// On failure it returns a nil io.ReadCloser together with the error from
// os.Open, so callers never receive a non-nil interface wrapping a nil file.
func getInput(fileName string) (io.ReadCloser, error) {
	if fileName == "" || fileName == "-" {
		return os.Stdin, nil
	}
	file, err := os.Open(fileName)
	if err != nil {
		// Return a literal nil: passing the nil *os.File through would yield
		// an interface value that compares unequal to nil.
		return nil, err
	}
	return file, nil
}
// getOutput returns the writer to send results to: os.Stdout when fileName is
// "" or "-", otherwise the named file created (or truncated) via os.Create.
//
// On failure it returns a nil io.WriteCloser together with the error from
// os.Create, so callers never receive a non-nil interface wrapping a nil file.
func getOutput(fileName string) (io.WriteCloser, error) {
	if fileName == "" || fileName == "-" {
		return os.Stdout, nil
	}
	file, err := os.Create(fileName)
	if err != nil {
		// Return a literal nil: passing the nil *os.File through would yield
		// an interface value that compares unequal to nil.
		return nil, err
	}
	return file, nil
}
// printUsage writes help text to w. When the first command-line argument is
// "-h" or "--help" it prints the detailed, colorized help; otherwise it prints
// a short usage summary that includes the registered flag defaults.
func printUsage(w io.Writer) {
	if len(os.Args) > 1 && (os.Args[1] == "-h" || os.Args[1] == "--help") {
		printDetailedHelp(w)
		return
	}
	printShortUsage(w)
}

// printDetailedHelp writes the full colorized help screen to w.
func printDetailedHelp(w io.Writer) {
	fmt.Fprintf(w, "\n%s%s%s\n",
		color.HiGreenString("SeqHasher"),
		color.WhiteString(" : "),
		color.HiMagentaString("DNA Sequence Hashing Tool"))
	fmt.Fprintf(w, "%s %s\n", color.HiCyanString("version:"), color.WhiteString(version))
	fmt.Fprintln(w, color.WhiteString("====================================="))
	fmt.Fprintln(w, color.HiCyanString("Usage:"))
	fmt.Fprintf(w, " %s\n", color.WhiteString("seqhasher [options] <input_file> [output_file]"))
	fmt.Fprintln(w, color.HiCyanString("\nOverview:"))
	fmt.Fprintln(w, color.WhiteString(" SeqHasher takes DNA sequences from a FASTA/FASTQ file, computes a hash digest for each sequence,"))
	fmt.Fprintln(w, color.WhiteString(" and generates an output file with modified headers."))
	fmt.Fprintln(w, color.WhiteString(" For input/output via stdin/stdout, use '-' instead of the file name."))

	fmt.Fprintln(w, color.HiCyanString("\nOptions:"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-o"), color.HiMagentaString("--headersonly"), color.WhiteString(" Output only sequence headers, excluding the sequences themselves"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-H"), color.HiMagentaString("--hash <type1,type2,...>"), color.WhiteString("Hash algorithm(s): sha1 (default), sha3, md5, xxhash, cityhash, murmur3, nthash, blake3"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-c"), color.HiMagentaString("--casesensitive"), color.WhiteString("Take into account sequence case. By default, sequences are converted to uppercase"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-n"), color.HiMagentaString("--nofilename"), color.WhiteString(" Omit the file name from the sequence header"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-f"), color.HiMagentaString("--name <text>"), color.WhiteString(" Replace the input file's name in the header with <text>"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-v"), color.HiMagentaString("--version"), color.WhiteString(" Print the version of the program and exit"))
	fmt.Fprintf(w, " %s, %s %s\n", color.HiMagentaString("-h"), color.HiMagentaString("--help"), color.WhiteString(" Show this help message and exit"))

	fmt.Fprintln(w, color.HiCyanString("\nArguments:"))
	fmt.Fprintf(w, " %s %s\n", color.HiMagentaString("<input_file>"), color.WhiteString(" Path to the input FASTA/FASTQ file (supports gzip, zstd, xz, or bzip2 compression)"))
	fmt.Fprintf(w, " %s\n", color.WhiteString(" or '-' for standard input (stdin)"))
	fmt.Fprintf(w, " %s %s\n", color.HiMagentaString("[output_file]"), color.WhiteString(" Path to the output file or '-' for standard output (stdout)"))
	fmt.Fprintln(w, color.WhiteString(" If omitted, output is sent to stdout."))

	fmt.Fprintln(w, color.HiCyanString("\nExamples:"))
	fmt.Fprintln(w, color.WhiteString(" seqhasher input.fasta.gz output.fasta"))
	fmt.Fprintln(w, color.WhiteString(" cat input.fasta | seqhasher --name 'Sample' --hash xxhash - - > output.fasta"))
	fmt.Fprintln(w, color.WhiteString(" seqhasher --headersonly --nofilename --hash sha1,nthash input.fa.gz - > headers.txt"))

	fmt.Fprintln(w, color.WhiteString("\nFor more information, visit the GitHub repository:"))
	fmt.Fprintln(w, color.WhiteString("https://github.com/vmikk/seqhasher"))
}

// printShortUsage writes the brief usage summary, including the defaults of
// all registered flags, to w.
func printShortUsage(w io.Writer) {
	fmt.Fprintf(w, "SeqHasher v%s\n", version)
	fmt.Fprintf(w, "Usage: %s [options] <input_file> [output_file]\n", os.Args[0])
	fmt.Fprintf(w, "Options:\n")

	// flag.PrintDefaults writes to the flag set's own output (standard error
	// unless changed), not to w. Redirect it for the duration of the call so
	// the whole usage message ends up on one stream, then restore it.
	previous := flag.CommandLine.Output()
	flag.CommandLine.SetOutput(w)
	flag.PrintDefaults()
	flag.CommandLine.SetOutput(previous)

	fmt.Fprintf(w, "\nSupported hash types: %s\n", strings.Join(supportedHashTypes, ", "))
	fmt.Fprintf(w, "If input_file is '-' or omitted, reads from stdin.\n")
	fmt.Fprintf(w, "If output_file is '-' or omitted, writes to stdout.\n")
	fmt.Fprintf(w, "\nFor more detailed help, use -h or --help.\n")
}
// processSequences reads FASTA/FASTQ records from input, prefixes each record
// name with the configured hash digests of its sequence (and, unless
// suppressed, the file name), and writes the result to output.
//
// The file name placed in the header is cfg.nameOverride when set, otherwise
// cfg.inputFileName; when reading from "-" without an override the file name
// is left out. Whitespace is stripped from every sequence, and the sequence is
// uppercased unless cfg.caseSensitive is set. With cfg.headersOnly only the
// rewritten names are written, one per line.
//
// It returns an error if the reader cannot be created, a record cannot be
// read, or writing/flushing the output fails.
func processSequences(input io.Reader, output io.Writer, cfg config) error {
	writer := bufio.NewWriter(output)
	defer writer.Flush() // best-effort flush on the error paths below

	inputFileName := cfg.inputFileName
	if cfg.nameOverride != "" {
		inputFileName = cfg.nameOverride
	} else if inputFileName == "-" {
		cfg.noFileName = true // Skip filename for stdin unless overridden
	}

	reader, err := fastx.NewReaderFromIO(seq.DNA, bufio.NewReader(input), fastx.DefaultIDRegexp)
	if err != nil {
		return fmt.Errorf("Failed to create reader: %w", err)
	}
	defer reader.Close()

	// Resolve the hash functions once instead of once per record.
	hashFuncs := make([]func([]byte) string, 0, len(cfg.hashTypes))
	for _, hashType := range cfg.hashTypes {
		hashFuncs = append(hashFuncs, getHashFunc(hashType))
	}
	hashes := make([]string, 0, len(hashFuncs))

	for {
		record, err := reader.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			return fmt.Errorf("Error reading record: %w", err)
		}

		// Strip all whitespace characters from the sequence before processing
		// (as defined by Unicode's White Space property, which includes
		// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP)).
		sequence := bytes.Join(bytes.Fields(record.Seq.Seq), nil)

		// Convert sequence to uppercase if case-insensitive hashing is enabled
		if !cfg.caseSensitive {
			sequence = bytes.ToUpper(sequence)
		}
		record.Seq.Seq = sequence // Update the sequence in-place

		// Compute hashes, reusing the slice's backing array across records.
		hashes = hashes[:0]
		for _, hashFunc := range hashFuncs {
			hashes = append(hashes, hashFunc(sequence))
		}

		// Modify header in-place
		switch {
		case len(hashes) > 0 && cfg.noFileName:
			record.Name = []byte(fmt.Sprintf("%s;%s", strings.Join(hashes, ";"), record.Name))
		case len(hashes) > 0:
			record.Name = []byte(fmt.Sprintf("%s;%s;%s", inputFileName, strings.Join(hashes, ";"), record.Name))
		case !cfg.noFileName:
			record.Name = []byte(fmt.Sprintf("%s;%s", inputFileName, record.Name))
		}

		if cfg.headersOnly {
			if _, err := fmt.Fprintf(writer, "%s\n", record.Name); err != nil {
				return fmt.Errorf("Error writing header: %w", err)
			}
		} else {
			if _, err := writer.Write(record.Format(0)); err != nil {
				return fmt.Errorf("Error writing record: %w", err)
			}
		}
	}

	return writer.Flush()
}
// getHashFunc returns a function that hashes a byte slice with the algorithm
// named by hashType and returns the digest as a hex string. Any name other
// than sha3, md5, xxhash, cityhash, murmur3, nthash or blake3 selects SHA1.
//
// The returned function logs a message and returns "" when given empty input,
// whatever the algorithm.
func getHashFunc(hashType string) func([]byte) string {
	// Choose the digest routine up front; the returned closure only adds the
	// empty-input guard in front of it.
	var digest func(data []byte) string

	switch hashType {
	case "sha3":
		digest = func(data []byte) string {
			sum := sha3.Sum512(data)
			return hex.EncodeToString(sum[:])
		}
	case "md5":
		digest = func(data []byte) string {
			sum := md5.Sum(data)
			return hex.EncodeToString(sum[:])
		}
	case "xxhash":
		digest = func(data []byte) string {
			return fmt.Sprintf("%016x", xxhash.Sum64(data))
		}
	case "cityhash":
		digest = func(data []byte) string {
			sum := city.Hash128(data)
			return fmt.Sprintf("%016x%016x", sum.High, sum.Low)
		}
	case "murmur3":
		digest = func(data []byte) string {
			first, second := murmur3.Sum128(data)
			return fmt.Sprintf("%016x%016x", first, second)
		}
	case "nthash":
		digest = func(data []byte) string {
			// The k-mer size is the full sequence length.
			hasher, err := nthash.NewHasher(&data, uint(len(data)))
			if err != nil {
				log.Printf("Error creating ntHash hasher: %v", err)
				return ""
			}
			// false for non-canonical hash; the second result is discarded.
			value, _ := hasher.Next(false)
			return fmt.Sprintf("%016x", value)
		}
	case "blake3":
		digest = func(data []byte) string {
			sum := blake3.Sum256(data)
			return hex.EncodeToString(sum[:])
		}
	default:
		// "sha1" and every unrecognized name use SHA1.
		digest = func(data []byte) string {
			sum := sha1.Sum(data)
			return hex.EncodeToString(sum[:])
		}
	}

	return func(data []byte) string {
		if len(data) == 0 {
			log.Printf("Error: Empty DNA sequence provided, resulting in an empty hash.")
			return ""
		}
		return digest(data)
	}
}