#!/usr/bin/env perl use strict; use warnings; use FileHandle; use Getopt::Long; use File::Basename; use POSIX qw(strftime); my $A_NCBI_LIST = ""; my $A_CURRENT = ""; my $A_OUTDIR = "assembly_stats_$$"; my $A_FORCE = 0; my $A_VERBOSE = 1; my $A_SILENT = 0; my $A_HELP = 0; our $PRG_NAME = basename($0); my $result = &GetOptions("list=s" => \$A_NCBI_LIST, "prev=s" => \$A_CURRENT, "outdir=s" => \$A_OUTDIR, "force" => \$A_FORCE, "s" => \$A_SILENT, "h" => \$A_HELP, ); if (!$result || $A_HELP) { usage(); } if($A_SILENT) { $A_VERBOSE=0; } my $fh_genomes = new FileHandle($A_NCBI_LIST, "r"); if (!defined $fh_genomes) { usage("[Error] Cannot open $A_NCBI_LIST in [r] mode."); } my $fh_current = new FileHandle($A_CURRENT, "r"); if (!defined $fh_current) { usage("[Error] Cannot open $A_CURRENT in [r] mode."); } if ( !$A_FORCE && -d $A_OUTDIR ) { usage("[Error] Directory $A_OUTDIR already exists."); } if ( $A_OUTDIR eq "" ) { usage("[Error] Cannot create directory [parameter -outdir]."); } system ("mkdir -p $A_OUTDIR"); my $fhreport = new FileHandle("$A_OUTDIR/assembly_stats", "w"); $fhreport->autoflush(1); # Read existing table my %known_genomes = (); my ($cpt, $line) = (0, ""); warn "Load the list of already downloaded genomes...\n" if $A_VERBOSE; while ($line = $fh_current->getline()) { print $fhreport $line; chomp($line); my @fields = split(/;/, $line); $known_genomes{$fields[1]}=1; $cpt++; } warn "found $cpt already downloaded genomes.\n" if $A_VERBOSE; # Load genomes list, download assembly and compute metrics $cpt = 0; warn "Look at genome list...\n" if $A_VERBOSE; while ($line = $fh_genomes->getline()) { if($line =~ /^#/) { next; } chomp($line); my @fields = split(/\t+/, $line); my ($species, $id, $url, $bioP, $group, $subGroup, $gSize, $modDate, $status, $center, $bioS) = ($fields[0], $fields[8], "", $fields[2], $fields[4], $fields[5], $fields[6], $fields[15], $fields[16], $fields[17], $fields[18]); if(defined $known_genomes{$id} && $known_genomes{$id}==1) { next; } my $report = "$url;$bioP;$group;$subGroup;$gSize;$modDate;$status;$center;$bioS"; my $odir = $A_OUTDIR."/".$id; system("mkdir $odir"); my $link_file = $odir."/".$id . ".url"; warn "*************** Download $id ($species) and compute metrics ***************\n" if $A_VERBOSE; warn "+++++++++++++++ Get URL +++++++++++++++\n" if $A_VERBOSE; system("curl \"https://www.ncbi.nlm.nih.gov/assembly/?term=$id&report=xml\" | tee $link_file.bckup | grep FtpPath_GenBank | sed \"s=<= =g\" | sed \"s=>= =g\" | awk '{ print \$2;}' > $link_file"); my $fh = new FileHandle($link_file, "r"); while (my $l = $fh->getline()) { chomp($l); $url=$l; } if($url eq "") { warn "@@@@@ Can't download the following genome assembly: $id ($species) @@@@@\n"; next; } warn "+++++++++++++++ Download Fasta +++++++++++++++\n" if $A_VERBOSE; my $dir = basename($url); system("curl $url/${dir}_genomic.fna.gz | gunzip > $odir/${dir}_genomic.fna"); warn "+++++++++++++++ Generate contigs +++++++++++++++\n" if $A_VERBOSE; system("scaf2contigs.pl -i $odir/${dir}_genomic.fna -o $odir/${dir}_genomic_contigs.fna"); warn "+++++++++++++++ Compute stats +++++++++++++++\n" if $A_VERBOSE; system("assemblyMetrics $odir/${dir}_genomic.fna > $odir/${dir}_genomic.stats"); system("assemblyMetrics $odir/${dir}_genomic_contigs.fna > $odir/${dir}_genomic_contigs.stats"); warn "+++++++++++++++ Generate report +++++++++++++++\n" if $A_VERBOSE; my($stats_seq, $stats_contigs) = ("", ""); system("cat $odir/${dir}_genomic.stats | sed \"s=(==\" | sed \"s=)==\" | awk '{ if(\$1==\"N50\") {n50=\$3; l50=\$5; } if(\$1==\"N80\") {n80=\$3; l80=\$5; } if(\$1==\"N90\") {n90=\$3; l90=\$5; } if(\$1==\"Assembly\") { size=\$3; nb=\$5; min=\$7; max=\$9; avg=\$11; } if(\$1==\"NumberOfN=\") { n=\$2; perc_n=\$3; }} END {OFS=\";\"; print size,nb,max,min,avg,n50,l50,n80,l80,n90,l90,n,perc_n; }' > $odir/${dir}_genomic.stats.oneline"); $fh = new FileHandle("$odir/${dir}_genomic.stats.oneline", "r"); while (my $ls = $fh->getline()) { chomp($ls); $stats_seq=$ls; } system("cat $odir/${dir}_genomic_contigs.stats | sed \"s=(==\" | sed \"s=)==\" | awk '{ if(\$1==\"N50\") {n50=\$3; l50=\$5; } if(\$1==\"N80\") {n80=\$3; l80=\$5; } if(\$1==\"N90\") {n90=\$3; l90=\$5; } if(\$1==\"Assembly\") { size=\$3; nb=\$5; min=\$7; max=\$9; avg=\$11; }} END {OFS=\";\"; print size,nb,max,min,avg,n50,l50,n80,l80,n90,l90; }' > $odir/${dir}_genomic_contigs.stats.oneline"); $fh = new FileHandle("$odir/${dir}_genomic_contigs.stats.oneline", "r"); while (my $lc = $fh->getline()) { chomp($lc); $stats_contigs=$lc; } $cpt++; my $fho = new FileHandle("$odir/assembly_stats", "w"); my $date = strftime "%Y/%m/%d", localtime; print $fho $species.";".$id.";".$stats_seq.";".$stats_contigs.";".$date.";".$url."\n"; print $fhreport $species.";".$id.";".$stats_seq.";".$stats_contigs.";".$date.";".$report."\n"; warn "+++++++++++++++ remove fasta file +++++++++++++++\n" if $A_VERBOSE; system("rm $odir/${dir}_genomic.fna $odir/${dir}_genomic_contigs.fna"); } warn "found $cpt new genomes and output is in $A_OUTDIR/assembly_stats.\n" if $A_VERBOSE; #system("rm -r $A_OUTDIR"); sub usage { my $usage = " $PRG_NAME - update genome list from NCBI and generate an html report. -list : file of genome list downloaded from NCBI ftp. Mandatory. ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt -prev : file with the previously downloaded genomes. Mandatory. -outdir : Output directory, default is assembly_stats_XXXX. -force : Force to use the specified output dir even if it already exists. -v : verbose mode -h : this help \n"; warn $usage; exit 1; }