Skip to content

Commit

Permalink
Batch XML export into chunks of 5000 records
Browse files: browse the repository at this point in the history
  • Loading branch information
cudevmaxwell committed Oct 4, 2018
1 parent 1811e68 commit 5f1d2e7
Showing 1 changed file with 51 additions and 33 deletions.
84 changes: 51 additions & 33 deletions main.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,13 +16,10 @@ import (
)

var etdCSVFilePath = flag.String("in", "etd-output.csv", "Path to etd CSV file.")
var reportFilePath = flag.String("report", "report.csv", "Path to which the report csv file will be written.")
var crossrefOutputFilePath = flag.String("out", "crossref.xml", "Path to which the output XML file will be written.")
var prefix = flag.String("prefix", "", "DOI prefix.")
var depositorName = flag.String("depositor", "", "Name of the organization registering the DOIs. The name placed in this element should match the name under which a depositing organization has registered with CrossRef.")
var depositorEmail = flag.String("email", "", "Email address to which batch success and/or error messages are sent. It is recommended that this address be unique to a position within the organization submitting data (e.g. \"doi@...\") rather than unique to a person. In this way, the alias for delivery of this mail can be changed as responsibility for submission of DOI data within the organization changes from one person to another.")
var registrant = flag.String("registrant", "", "The organization that owns the information being registered.")
var timeFlag = flag.Int64("timestamp", 0, "An int64 representation of the nanoseconds since the epoch. Used to set the DOI submission batch and timestamp.")
var starting = flag.Int("starting", 1, "The starting value for the incrementing integer section of the DOI pattern 'prefix/etd/year-intvalue'")

func main() {
Expand All @@ -41,35 +38,20 @@ func main() {
log.Fatalln("prefix required")
}

var runAtTime time.Time

if *timeFlag == 0 {
runAtTime = time.Now().UTC()
} else {
runAtTime = time.Unix(0, *timeFlag)
}
dois := make(map[string]bool)

// Open the ETD export from CURVE.
etdCSVFile, err := os.Open(*etdCSVFilePath)
if err != nil {
log.Fatal(err)
}
etdCSVReader := csv.NewReader(etdCSVFile)

templateData := new(TemplateData)
etdCSVReader := csv.NewReader(etdCSVFile)

templateData.HeadData = HeadData{
DOIBatch: runAtTime.Unix(),
Timestamp: runAtTime.UnixNano(),
DepositorName: *depositorName,
DepositorEmail: *depositorEmail,
Registrant: *registrant,
}
dissertations := []*Dissertation{}

lineNumber := 0

dois := make(map[string]bool)

for {
lineNumber = lineNumber + 1

Expand Down Expand Up @@ -160,26 +142,62 @@ func main() {
dois[dissertation.DOI] = true
}

templateData.BodyData.Dissertations = append(templateData.BodyData.Dissertations, dissertation)
dissertations = append(dissertations, dissertation)
}

output, err := os.Create(*crossrefOutputFilePath)
if err != nil {
log.Fatalln(err)
fullBatches := len(dissertations) / 5000
remainder := len(dissertations) % 5000

batches := []*TemplateData{}

for i := 0; i < fullBatches; i++ {
templateData := new(TemplateData)
runAt := time.Now().UTC()
templateData.HeadData = HeadData{
DOIBatch: runAt.Unix(),
Timestamp: runAt.UnixNano(),
DepositorName: *depositorName,
DepositorEmail: *depositorEmail,
Registrant: *registrant,
}
templateData.BodyData.Dissertations = dissertations[i*5000:((i+1)*5000)]
batches = append(batches, templateData)
time.Sleep(1 * time.Second)
}
defer output.Close()

report, err := os.Create(*reportFilePath)
if err != nil {
log.Fatalln(err)
if remainder != 0 {
templateData := new(TemplateData)
runAt := time.Now().UTC()
templateData.HeadData = HeadData{
DOIBatch: runAt.Unix(),
Timestamp: runAt.UnixNano(),
DepositorName: *depositorName,
DepositorEmail: *depositorEmail,
Registrant: *registrant,
}
templateData.BodyData.Dissertations = dissertations[fullBatches*5000:]
batches = append(batches, templateData)
}

for i, templateData := range batches {
output, err := os.Create(fmt.Sprintf("%v-crossref-%v.xml", time.Now().UTC().Format("2006-01-02"), i+1))
if err != nil {
log.Fatalln(err)
}
defer output.Close()

t := template.Must(template.New("template").Parse(templateSkeleton))
err = t.Execute(output, &templateData)
if err != nil {
log.Fatalln(err)
}
}
defer report.Close()

t := template.Must(template.New("template").Parse(templateSkeleton))
err = t.Execute(output, &templateData)
report, err := os.Create(fmt.Sprintf("%v-report.csv", time.Now().UTC().Format("2006-01-02")))
if err != nil {
log.Fatalln(err)
}
defer report.Close()

w := csv.NewWriter(report)

Expand All @@ -188,7 +206,7 @@ func main() {
log.Fatalln("Error writing to csv:", err)
}

for _, dissertation := range templateData.BodyData.Dissertations {
for _, dissertation := range dissertations {
err = w.Write([]string{dissertation.UUID, dissertation.DOI})
if err != nil {
log.Fatalln("Error writing to csv:", err)
Expand Down

0 comments on commit 5f1d2e7

Please sign in to comment.