diff --git a/.editorconfig b/.editorconfig
index 63eff3a4..8719a7f9 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -30,3 +30,7 @@ indent_style = unset
 # ignore python
 [*.{py}]
 indent_style = unset
+
+# ignore perl
+[*.{pl,pm}]
+indent_size = unset
diff --git a/bin/FAlite_a93cba2.pm b/bin/FAlite_a93cba2.pm
new file mode 100644
index 00000000..bebd8ed5
--- /dev/null
+++ b/bin/FAlite_a93cba2.pm
@@ -0,0 +1,143 @@
+package FAlite_a93cba2;
+use strict;
+use Scalar::Util qw(reftype);
+
+# Constructor: FAlite_a93cba2->new($fh), where $fh is a GLOB reference such as
+# \*STDIN or a lexical filehandle. Skips leading blank lines and remembers the
+# first definition line; if the input is empty or does not start with '>' it
+# warns and returns a reader whose nextEntry() yields nothing.
+sub new {
+    my ($class, $fh) = @_;
+    # `ref $fh !~ /GLOB/` parses as ref($fh !~ /GLOB/), which is always false, so
+    # test the underlying reference type (reftype looks through any blessing).
+    my $type = reftype($fh);
+    if (not defined $type or $type ne 'GLOB')
+        {die ref($fh), "\n", "FAlite_a93cba2 ERROR: expect a GLOB reference\n"}
+    my $this = bless {}, ref($class) || $class;
+    $this->{FH} = $fh;
+    while(<$fh>) {last if $_ =~ /\S/} # not supposed to have blanks, but...
+    my $firstline = $_;
+    if (not defined $firstline) {warn "FAlite_a93cba2: Empty\n"; return $this}
+    if ($firstline !~ /^>/) {warn "FAlite_a93cba2: Not FASTA formatted\n"; return $this}
+    $this->{LASTLINE} = $firstline;
+    chomp $this->{LASTLINE};
+    return $this;
+}
+
+# Returns the next record as a FAlite_a93cba2::Entry object, or 0 once no
+# further record can be read. A definition line that is not followed by any
+# further input line is not returned.
+sub nextEntry {
+    my ($this) = @_;
+    return 0 if not defined $this->{LASTLINE};
+    my $fh = $this->{FH};
+    my $def = $this->{LASTLINE};
+    my @seq;
+    my $lines_read = 0;
+    while(<$fh>) {
+        $lines_read++;
+        if ($_ =~ /^>/) {
+            $this->{LASTLINE} = $_;
+            chomp $this->{LASTLINE};
+            last;
+        }
+        push @seq, $_;
+    }
+    return 0 if $lines_read == 0;
+    chomp @seq;
+    my $entry = FAlite_a93cba2::Entry::new($def, \@seq);
+    return $entry;
+}
+
+package FAlite_a93cba2::Entry;
+use overload '""' => 'all';
+# Plain function (not a method): new($def, \@seq_lines). Joins the sequence
+# lines and strips all whitespace from the result.
+sub new {
+    my ($def, $seqarry) = @_;
+    my $this = bless {}, __PACKAGE__;
+    $this->{DEF} = $def;
+    $this->{SEQ} = join("", @$seqarry);
+    $this->{SEQ} =~ s/\s//g; # just in case more spaces
+    return $this;
+}
+sub def {shift->{DEF}}
+sub seq {shift->{SEQ}}
+sub all {my $e = shift; return $e->{DEF}."\n".$e->{SEQ}."\n"}
+
+1;
+
+__END__
+
+=head1 NAME
+
+FAlite_a93cba2 - lightweight parser for FASTA files and databases
+
+=head1 SYNOPSIS
+
+    use FAlite_a93cba2;
+    my $fasta = FAlite_a93cba2->new(\*STDIN);
+    while(my $entry = $fasta->nextEntry) {
+        $entry->def;
+        $entry->seq;
+    }
+
+=head1 DESCRIPTION
+
+FAlite_a93cba2 is a package for parsing FASTA files and databases. The FASTA format is
+widely used in bioinformatics. It consists of a definition line followed by
+sequence with an arbitrary number of lines and line lengths.
+
+A FASTA file looks like this:
+
+    >identifier descriptive text
+    GAATTC
+
+A FASTA database looks like this:
+
+    >identifier1 some text describing this entry
+    GAATTC
+    ACTAGT
+    >identifier2 some text describing this entry
+    AAACCT
+    GCTAAT
+
+=head2 Object
+
+FAlite_a93cba2 has two kinds of objects, the file and the entry.
+
+    my $fasta_file = FAlite_a93cba2->new(\*STDIN); # or any other filehandle
+    $entry = $fasta_file->nextEntry; # single fasta file
+    while(my $entry = $fasta_file->nextEntry) {
+        # canonical form of use for fasta database
+    }
+
+The entry has two attributes (def and seq).
+
+    $entry->def; # access the def line
+    $entry->seq; # access the sequence
+    "$entry"; # overload to fasta file ($entry->def . "\n" . $entry->seq . "\n")
+
+=head1 AUTHOR
+
+Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf)
+
+=head1 ACKNOWLEDGEMENTS
+
+This software was developed at the Genome Sequencing Center at Washington
+University, St. Louis, MO.
+
+=head1 COPYRIGHT
+
+Copyright (C) 1999 Ian Korf. All Rights Reserved.
+
+=head1 DISCLAIMER
+
+This software is provided "as is" without warranty of any kind.
+
+=cut
+
+
+
+
+
diff --git a/bin/assemblathon_stats_a93cba2.pl b/bin/assemblathon_stats_a93cba2.pl
new file mode 100755
index 00000000..d10ba565
--- /dev/null
+++ b/bin/assemblathon_stats_a93cba2.pl
@@ -0,0 +1,528 @@
+#!/usr/bin/perl
+#
+# assemblathon_stats.pl
+#
+# A script to calculate a basic set of metrics from a genome assembly
+#
+# Author: Keith Bradnam, Genome Center, UC Davis
+# This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
+# This software is provided AS IS, without warranty of any kind.
+
+use strict;
+use warnings;
+use FAlite_a93cba2;
+use Getopt::Long;
+use List::Util qw(sum max min);
+
+###############################################
+#
+# C o m m a n d l i n e o p t i o n s
+#
+###############################################
+
+my $limit; # limit processing of data to first $limit sequences (for quick testing)
+my $graph; # produce some output ready for Excel or R
+my $csv; # produce CSV output file of results
+my $n_limit; # how many N characters should be used to split scaffolds into contigs
+my $genome_size; # estimated or known genome size (will be used for some stats)
+
+GetOptions ("limit=i" => \$limit,
+            "csv" => \$csv,
+            "graph" => \$graph,
+            "n=i" => \$n_limit,
+            "genome_size=i" => \$genome_size);
+
+# set defaults
+$limit = 1000000000 if (!$limit);
+$n_limit = 25 if (!$n_limit);
+
+
+# check we have a suitable input file (exactly one positional argument is required)
+my $usage = "Usage: assemblathon_stats.pl [options] ASSEMBLY_FASTA
+options:
+    -limit INT          limit analysis to first INT sequences (useful for testing)
+    -csv                produce a CSV output file of all results
+    -graph              produce a CSV output file of NG(X) values (NG1 through to NG99), suitable for graphing
+    -n INT              specify how many consecutive N characters should be used to split scaffolds into contigs
+    -genome_size INT    estimated or known genome size
+";
+
+die "$usage" unless (@ARGV == 1);
+my ($file) = @ARGV;
+
+###############################################
+#
+# S o m e G l o b a l v a r i a b l e s
+#
+###############################################
+
+my $scaffolded_contigs = 0; # how many contigs that are part of scaffolds (sequences must have $n_limit consecutive Ns)
+my $scaffolded_contig_length = 0; # total length of all scaffolded contigs
+my $unscaffolded_contigs = 0; # how many 'orphan' contigs, not part of a scaffold
+my $unscaffolded_contig_length = 0; # total length of all contigs not part of scaffold
+my $w = 60; # formatting width for output
+my %data; # data structure to hold all sequence info; key is 'scaffold' or 'contig', values are seqs & lengths arrays
+my (@results, @headers); # arrays to store results (for use with -csv option)
+
+
+
+# make first loop through file, capture some basic info and add sequences to arrays
+process_FASTA($file);
+
+print "\n---------------- Information for assembly \'$file\' ----------------\n\n";
+
+if(defined($genome_size)){
+    my $mbp_size = sprintf("%.2f", $genome_size / 1000000);
+    printf "%${w}s %10s\n", "Assumed genome size (Mbp)", $mbp_size;
+}
+
+# produce scaffold statistics
+sequence_statistics('scaffold');
+
+# produce a couple of intermediate statistics based on scaffolded contigs vs unscaffolded contigs
+sequence_statistics('intermediate');
+
+# finish with contig stats
+sequence_statistics('contig');
+
+# produce CSV output if required
+write_csv($file) if ($csv);
+
+exit(0);
+
+
+
+##########################################
+#
+#
+# S U B R O U T I N E S
+#
+#
+##########################################
+
+
+##########################################
+# M A I N loop through FASTA file
+##########################################
+# Reads the FASTA file $seqs (streamed through gunzip if it ends in .gz) into %data and the scaffold/contig counters
+sub process_FASTA{
+
+    my ($seqs) = @_;
+
+    my $input;
+
+    # gzip input is streamed through gunzip; list-form open keeps the file name away from the shell
+    if($seqs =~ m/\.gz$/){
+        open($input, "-|", "gunzip", "-c", $seqs) or die "Can't open a pipe to $seqs\n";
+    } else{
+        open($input, "<", "$seqs") or die "Can't open $seqs\n";
+    }
+
+    my $fasta = FAlite_a93cba2->new(\*$input);
+
+    # want to keep track of various contig + scaffold counts
+    my $seq_count = 0;
+
+    while(my $entry = $fasta->nextEntry){
+        my $seq = uc($entry->seq);
+        my $length = length($seq);
+        $seq_count++;
+
+        # everything gets pushed to scaffolds array
+        push(@{$data{scaffold}{seqs}},$seq);
+        push(@{$data{scaffold}{lengths}},$length);
+
+        # if there are at least $n_limit consecutive Ns in the sequence we need to split it into contigs
+        # otherwise the sequence must be a contig itself and it still needs to be added to the contig arrays
+        if ($seq =~ m/N{$n_limit}/){
+
+            # add length to $scaffolded_contig_length
+            $scaffolded_contig_length += $length;
+
+            # loop through all contigs that comprise the scaffold
+            foreach my $contig (split(/N{$n_limit,}/, $seq)){
+                next unless my $length = length($contig);
+                $scaffolded_contigs++;
+                push(@{$data{contig}{seqs}},$contig);
+                push(@{$data{contig}{lengths}},$length);
+            }
+        } else {
+            # must be here if the scaffold is actually just a contig (or is a scaffold with < $n_limit Ns)
+            $unscaffolded_contigs++;
+            $unscaffolded_contig_length += $length;
+            push(@{$data{contig}{seqs}},$seq);
+            push(@{$data{contig}{lengths}},$length);
+        }
+        # for testing, just use a few sequences
+        last if ($seq_count >= $limit);
+
+    }
+    close($input);
+    die "No sequences found in $seqs\n" unless ($seq_count); # the statistics below divide by sequence counts and lengths
+}
+
+
+##########################################
+# Calculate basic assembly metrics
+##########################################
+# $type is 'scaffold', 'intermediate' or 'contig'; each metric is printed and, with -csv, queued via store_results()
+sub sequence_statistics{
+    my ($type) = @_;
+
+    print "\n";
+
+    # need descriptions of each result
+    my $desc;
+
+    # there are just a couple of intermediate level statistics to print
+    if($type eq 'intermediate'){
+        my $total_size = sum(@{$data{scaffold}{lengths}});
+
+        # now calculate percentage of assembly that is accounted for by scaffolded contigs
+        my $percent = sprintf("%.1f",($scaffolded_contig_length / $total_size) * 100);
+        $desc = "Percentage of assembly in scaffolded contigs";
+        printf "%${w}s %10s\n", $desc, "$percent%";
+        store_results($desc, $percent) if ($csv);
+
+        # now calculate percentage of assembly that is accounted for by unscaffolded contigs
+        $percent = sprintf("%.1f",($unscaffolded_contig_length / $total_size) * 100);
+        $desc = "Percentage of assembly in unscaffolded contigs";
+        printf "%${w}s %10s\n", $desc, "$percent%";
+        store_results($desc, $percent) if ($csv);
+
+
+        # statistics that describe N regions that join contigs in scaffolds
+
+        # get number of breaks
+        my $contig_count = scalar(@{$data{contig}{lengths}});
+        my $scaffold_count = scalar(@{$data{scaffold}{lengths}});
+        my $average_contigs_per_scaffold = sprintf("%.1f",$contig_count / $scaffold_count);
+        $desc = "Average number of contigs per scaffold";
+        printf "%${w}s %10s\n", $desc, $average_contigs_per_scaffold;
+        store_results($desc, $average_contigs_per_scaffold) if ($csv);
+
+        # now calculate average length of break between contigs
+        # just find all runs of Ns in scaffolds (>= $n_limit) and calculate average length
+        my @contig_breaks;
+        foreach my $scaffold (@{$data{scaffold}{seqs}}){
+            while($scaffold =~ m/(N{$n_limit,})/g){
+                push(@contig_breaks, length($1));
+            }
+        }
+        # set break size to zero if there are no Ns in scaffolds
+        my $average_break_length;
+
+        if(@contig_breaks == 0){
+            $average_break_length = 0;
+        } else{
+            $average_break_length = sum(@contig_breaks) / scalar(@contig_breaks);
+        }
+        if($n_limit == 1) {
+            $desc = "Mean length of breaks (>=${n_limit}N) between contigs in scaffold";
+        } else {
+            $desc = "Mean length of breaks (>=${n_limit}Ns) between contigs in scaffold";
+        }
+        if(length($n_limit)>=5) {
+            printf "%${w}s %9d\n", $desc, $average_break_length;
+        } else {
+            printf "%${w}s %10d\n", $desc, $average_break_length;
+        }
+        store_results($desc, $average_break_length) if ($csv);
+        return();
+    }
+
+
+    # n
+    my $count = scalar(@{$data{$type}{lengths}});
+    $desc = "Number of ${type}s";
+    printf "%${w}s %10d\n", $desc, $count;
+    store_results($desc, $count) if ($csv);
+
+
+
+    # more contig details (only for contigs)
+    if ($type eq 'contig'){
+        $desc = "Number of contigs in scaffolds";
+        printf "%${w}s %10d\n",$desc, $scaffolded_contigs;
+        store_results($desc, $scaffolded_contigs) if ($csv);
+
+        $desc = "Number of contigs not in scaffolds";
+        printf "%${w}s %10d\n", $desc,$unscaffolded_contigs;
+        store_results($desc, $unscaffolded_contigs) if ($csv);
+    }
+
+
+    # total size of sequences
+    my $total_size = sum(@{$data{$type}{lengths}});
+    $desc = "Total size of ${type}s";
+    printf "%${w}s %10d\n", $desc, $total_size;
+    store_results($desc, $total_size) if ($csv);
+
+
+    # For scaffold data only, can calculate the percentage of known genome size
+    # and also the amount of useful sequence
+    if ($type eq 'scaffold' && defined($genome_size)){
+        my $percent = sprintf("%.1f",($total_size / $genome_size) * 100);
+        $desc = "Total scaffold length as percentage of assumed genome size";
+        printf "%${w}s %10s\n", $desc, "$percent%";
+        store_results($desc, $percent) if ($csv);
+
+        # Also want to find total fraction of genome (based on estimated size) that is
+        # in 'non-useful scaffolds', those below average size of vertebrate gene
+        # (taken to be 25 kbp)
+        my $useful_length = 25000;
+        my $sum_useful = 0;
+        foreach my $length (@{$data{$type}{lengths}}){
+            ($sum_useful += $length) if ($length >= $useful_length);
+        }
+        # report how much useful sequence there was
+        $desc = "Useful amount of $type sequences (>= 25K nt)";
+        printf "%${w}s %10d\n", $desc, $sum_useful;
+        store_results($desc, $sum_useful) if ($csv);
+
+        my $percent_useful = sprintf("%.1f",($sum_useful / $genome_size) * 100);
+        $desc = "% of estimated genome that is useful";
+        printf "%${w}s %10s\n", $desc, "$percent_useful%";
+        store_results($desc, $percent_useful) if ($csv);
+
+    }
+
+
+    # longest and shortest sequences
+    my $max = max(@{$data{$type}{lengths}});
+    $desc = "Longest $type";
+    printf "%${w}s %10d\n", $desc, $max;
+    store_results($desc, $max) if ($csv);
+
+    my $min = min(@{$data{$type}{lengths}});
+    $desc = "Shortest $type";
+    printf "%${w}s %10d\n", $desc, $min;
+    store_results($desc, $min) if ($csv);
+
+
+    # find number of sequences above certain sizes
+    my %sizes_to_shorthand = (1000 => '1K',
+                              10000 => '10K',
+                              100000 => '100K',
+                              1000000 => '1M',
+                              10000000 => '10M');
+
+    foreach my $size (qw(1000 10000 100000 1000000 10000000)){
+        my $matches = grep { $_ > $size } @{$data{$type}{lengths}};
+        my $percent = sprintf("%.1f", ($matches / $count) * 100);
+
+        $desc = "Number of ${type}s > $sizes_to_shorthand{$size} nt";
+        printf "%${w}s %10d %5s%%\n", $desc, $matches, $percent;
+        store_results($desc, $matches) if ($csv);
+
+        $desc = "Percentage of ${type}s > $sizes_to_shorthand{$size} nt";
+        store_results($desc, $percent) if ($csv);
+    }
+
+
+    # mean sequence size
+    my $mean = sprintf("%.0f",$total_size / $count);
+    $desc = "Mean $type size";
+    printf "%${w}s %10d\n", $desc, $mean;
+    store_results($desc, $mean) if ($csv);
+
+    # median sequence size (the upper of the two middle values when the count is even)
+    my $median = (sort{$a <=> $b} @{$data{$type}{lengths}})[$count/2];
+    $desc = "Median $type size";
+    printf "%${w}s %10d\n", $desc, $median;
+    store_results($desc, $median) if ($csv);
+
+
+
+    ##################################################################################
+    #
+    # N50 values
+    #
+    # Includes N(x) values, NG(x) (using assumed genome size)
+    # and L(x) values (number of sequences larger than or equal to N50 sequence size)
+    ##################################################################################
+
+    # keep track of cumulative assembly size (starting from longest seq)
+    my $running_total = 0;
+
+    # want to store all N50-style values from N1..N100. First target size to pass is N1
+    my $n_index = 1;
+    my @n_values;
+    my $n50_length = 0;
+
+    my $i = 0;
+
+    # start with longest lengths scaffold/contig
+    foreach my $length (reverse sort{$a <=> $b} @{$data{$type}{lengths}}){
+        $i++;
+        $running_total += $length;
+
+        # check the current sequence and all sequences shorter than current one
+        # to see if they exceed the current NX value
+        while($running_total > int (($n_index / 100) * $total_size)){
+            if ($n_index == 50){
+                $n50_length = $length;
+                $desc = "N50 $type length";
+                printf "%${w}s %10d\n", $desc, $length;
+                store_results($desc, $length) if ($csv);
+
+                # L50 = number of scaffolds/contigs that are longer than or equal to the N50 size
+                $desc = "L50 $type count";
+                printf "%${w}s %10d\n", $desc, $i;
+                store_results($desc, $i) if ($csv);
+            }
+            $n_values[$n_index] = $length;
+            $n_index++;
+        }
+    }
+
+    my @ng_values;
+
+    # do we have an estimated/known genome size to work with?
+    if(defined($genome_size)){
+        my $ng_index = 1;
+        my $ng50_length = 0;
+
+        $running_total = 0;
+        $i = 0;
+
+        foreach my $length (reverse sort{$a <=> $b} @{$data{$type}{lengths}}){
+            $i++;
+            $running_total += $length;
+
+            # now do the same for NG values, using assumed genome size
+            while($running_total > int (($ng_index / 100) * $genome_size)){
+                if ($ng_index == 50){
+                    $ng50_length = $length;
+                    $desc = "NG50 $type length";
+                    printf "%${w}s %10d\n", $desc, $length;
+                    store_results($desc, $length) if ($csv);
+
+                    $desc = "LG50 $type count";
+                    printf "%${w}s %10d\n", $desc, $i;
+                    store_results($desc, $i) if ($csv);
+                }
+                $ng_values[$ng_index] = $length;
+                $ng_index++;
+            }
+        }
+
+        # calculate N50/NG50 difference
+        my $n50_diff = abs($ng50_length - $n50_length);
+        $desc = "N50 $type - NG50 $type length difference";
+        printf "%${w}s %10d\n", $desc, $n50_diff;
+        store_results($desc, $n50_diff) if ($csv);
+
+    }
+    # add final value to @n_values and @ng_values which will just be the shortest sequence
+#    $n_values[100] = $min;
+#    $ng_values[100] = $min;
+
+
+    # base frequencies
+    my %bases;
+
+    my $seq = join('',@{$data{$type}{seqs}});
+    my $length = length($seq);
+
+    # count mononucleotide frequencies
+    $bases{A} = ($seq =~ tr/A/A/);
+    $bases{C} = ($seq =~ tr/C/C/);
+    $bases{G} = ($seq =~ tr/G/G/);
+    $bases{T} = ($seq =~ tr/T/T/);
+    $bases{N} = ($seq =~ tr/N/N/);
+
+    my $base_count = 0;
+    foreach my $base (qw(A C G T N)){
+        my $percent = sprintf("%.2f", ($bases{$base} / $length) * 100);
+        $desc = "$type %$base";
+        printf "%${w}s %10s\n", $desc, $percent;
+        store_results($desc, $percent) if ($csv);
+        $base_count += $bases{$base};
+    }
+
+    # calculate remainder ('other') in case there are other characters present
+    my $other = $length - $base_count;
+    my $percent = sprintf("%.2f", ($other / $length) * 100);
+    $desc = "$type %non-ACGTN";
+    printf "%${w}s %10s\n",$desc, $percent;
+    store_results($desc, $percent) if ($csv);
+
+    $desc = "Number of $type non-ACGTN nt";
+    printf "%${w}s %10d\n",$desc, $other;
+    store_results($desc, $other) if ($csv);
+
+
+    # anything to dump for graphing?
+    if($graph){
+
+        # create new output file name
+        my $file_name = $file;
+        $file_name =~ s/\.gz$//;
+        $file_name =~ s/\.(fa|fasta)$//;
+        $file_name .= ".${type}.NG50.csv";
+
+        open(my $out, ">", "$file_name") or die "Can't create $file_name\n";
+        print $out join (',',"Assembly",1..99), "\n";
+
+        # make some guesses of what might constitute the unique assembly ID
+        my $assembly_ID = $file;
+        ($assembly_ID) = $file =~ m/^([A-Z]\d{1,2})_/ if ($file =~ m/^[A-Z]\d{1,2}_/);
+        ($assembly_ID) = $file =~ m/^((bird|snake|fish)_\d+(C|E))_/ if ($file =~ m/^(bird|snake|fish)_\d+(?:C|E)_/);
+
+        # CSV file, with filename in first column
+        print $out "$assembly_ID";
+
+        for (my $i = 1; $i < 100; $i++){
+            # higher NG values might not be present if assembly is poor
+            if (defined $ng_values[$i]){
+                print $out ",$ng_values[$i]";
+            } else{
+                print $out ",0";
+            }
+        }
+        print $out "\n";
+        close($out);
+    }
+}
+
+# simple routine to add results to a pair of arrays that will be used for printing results later on
+# if -csv option is used
+sub store_results{
+    my ($desc, $result) = @_;
+
+    push(@headers,$desc);
+    push(@results,$result);
+}
+# Writes the queued headers/results as a two-row CSV file named after the input file (-csv option)
+sub write_csv{
+    my ($file) = @_;
+
+    # create new output file name
+    my $output = $file;
+    $output =~ s/\.gz$//;
+    $output =~ s/\.(fa|fasta)$//;
+    $output .= ".csv";
+
+    # make some guesses of what might constitute the unique assembly ID
+    my $assembly_ID = $file;
+    ($assembly_ID) = $file =~ m/^([A-Z]\d{1,2})_/ if ($file =~ m/^[A-Z]\d{1,2}_/);
+    ($assembly_ID) = $file =~ m/^((bird|snake|fish)_\d+(C|E))_/ if ($file =~ m/^(bird|snake|fish)_\d+(?:C|E)_/);
+
+    open(my $out, ">", $output) or die "Can't create $output\n";
+
+    print $out "Assembly,";
+    foreach my $header (@headers){
+        print $out "$header,";
+    }
+    print $out "\n";
+
+    print $out "$assembly_ID,";
+    foreach my $result (@results){
+        print $out "$result,";
+    }
+    print $out "\n";
+
+
+    close($out);
+}
diff --git a/modules/local/assemblathon_stats.nf b/modules/local/assemblathon_stats.nf
new file mode 100644
index 00000000..3285e08c
--- /dev/null
+++ b/modules/local/assemblathon_stats.nf
@@ -0,0 +1,55 @@
+process ASSEMBLATHON_STATS {
+    tag "${asm_tag}"
+    label "process_single"
+
+    container "${ workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04':
+        'quay.io/nf-core/ubuntu:20.04' }"
+
+    input:
+    tuple val(asm_tag), path(fasta_file)
+    val n_limit
+
+    output:
+    path "${asm_tag}_stats.csv" , emit: stats
+    path 'versions.yml' , emit: versions
+
+    script:
+    def VERSION = "github/PlantandFoodResearch/assemblathon2-analysis/a93cba2"
+    """
+    paths_to_check=\$(printf "%s\\n" \$(echo \$PATH | tr ':' ' ') \\
+        | xargs -I {} find {} -maxdepth 0 -print 2>/dev/null \\
+        | grep -v '^\$' \\
+        | grep -v '/sbin' \\
+        | xargs
+    )
+
+    falite_path="\$(find \$paths_to_check -name FAlite_a93cba2.pm)"
+
+    ln -s "\$falite_path" FAlite_a93cba2.pm
+
+    PERL5LIB=./ assemblathon_stats_a93cba2.pl \\
+        -n $n_limit \\
+        -csv \\
+        "${fasta_file}"
+
+    csv_file_name=\$(ls | grep "csv")
+    mv \$csv_file_name "${asm_tag}_stats.csv"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        assemblathon_stats: $VERSION
+    END_VERSIONS
+    """
+
+    stub:
+    def VERSION = "github/PlantandFoodResearch/assemblathon2-analysis/a93cba2"
+    """
+    touch "${asm_tag}_stats.csv"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        assemblathon_stats: $VERSION
+    END_VERSIONS
+    """
+}
diff --git a/workflows/assemblyqc.nf b/workflows/assemblyqc.nf
index 0df1b2b0..34f71156 100644
--- a/workflows/assemblyqc.nf
+++ b/workflows/assemblyqc.nf
@@ -31,6 +31,7 @@ include { GT_STAT } from '../modules/pfr/gt/stat/main'
 include { GFF3_VALIDATE } from '../subworkflows/pfr/gff3_validate/main'
 include { NCBI_FCS_ADAPTOR } from '../modules/local/ncbi_fcs_adaptor'
 include { NCBI_FCS_GX } from '../subworkflows/local/ncbi_fcs_gx'
+include { ASSEMBLATHON_STATS } from '../modules/local/assemblathon_stats'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -243,6 +244,15 @@ workflow ASSEMBLYQC {
         [ tag, fa ]
     }
 
+    // MODULE: ASSEMBLATHON_STATS
+    ASSEMBLATHON_STATS(
+        ch_clean_assembly,
+        params.assemblathon_stats_n_limit
+    )
+
+    ch_assemblathon_stats = ASSEMBLATHON_STATS.out.stats
+    ch_versions = ch_versions.mix(ASSEMBLATHON_STATS.out.versions.first())
+
     // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')