#!/bin/sh
#
# Generate assembly metrics for NCBI assemblies selected by a keyword.
#
# Usage: <script> [-h] KEYWORD
#   KEYWORD  first non-option argument. It is used as a grep pattern on the
#            NCBI eukaryotes report (e.g. Plants, Insects, Mammals) and as
#            part of the working file names. Later arguments are ignored.
#   -h       print the usage text and exit.
#
# External tools invoked: curl, awk, grep, dos2unix, assembly_stats.pl.

pg=$(basename "$0")
TMP=/tmp/NCBI_assemblies.tmp/
WEB=/www/data/assemblies
URL=ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt

# Print the usage text and terminate the script with status 1.
usage() {
  echo "USAGE :"
  echo " Generate assembly metrics of NCBI assemblies"
  echo " First argument is the keyword for assembly selection"
  exit 1
}

NAME=
# NOTE(review): 'len' is not referenced in the visible part of the script;
# kept in case something later in the file relies on it.
len=0

# Walk over every argument: -h shows the usage, the first other non-empty
# argument becomes the selection keyword.
while [ "$#" -gt 0 ]; do
  case "$1" in
    -h)
      usage
      ;;
    *)
      if [ -z "$NAME" ]; then
        NAME=$1
      fi
      ;;
  esac
  shift
done

# A missing keyword is an error: report it on stderr with a failing status.
if [ -z "$NAME" ]; then
  echo "[Error] Need to provide a keyword for the assembly selection : Plants, Insects, Mammals, ..." >&2
  echo "See $pg -h for more details." >&2
  exit 1
fi

echo "Create and move to temporary directory..."
mkdir -p "$TMP" || exit 1
cd "$TMP" || exit 1

echo "Download assembly file from NCBI : $URL ..."
# Stop here when the download fails instead of processing an empty or
# truncated report.
if ! curl "$URL" > "$TMP/eukaryotes.txt"; then
  echo "[Error] Download of $URL failed." >&2
  exit 1
fi

# Strip the spaces from the 6th tab-separated column, then keep the lines
# matching the keyword. The keyword is still interpreted as a basic regular
# expression; -e keeps a keyword starting with '-' from being read as an
# option.
awk 'BEGIN { FS = "\t"; OFS = "\t" } { gsub(/ /, "", $6); print $0 }' \
  < "$TMP/eukaryotes.txt" \
  | grep -e "$NAME" > "$TMP/eukaryotes.$NAME.txt"
if [ ! -s "$TMP/eukaryotes.$NAME.txt" ]; then
  echo "[Warning] No line of the NCBI report matches '$NAME'." >&2
fi
dos2unix "$TMP/eukaryotes.$NAME.txt"

echo "Download new assemblies and compute stats..."
# The list of already processed assemblies must exist, even if empty.
[ -f "$TMP/$NAME.known" ] || touch "$TMP/$NAME.known"

# NOTE(review): the exit status of assembly_stats.pl is deliberately not
# checked because its exit-code contract is not visible here -- confirm
# whether a failure should abort the run.
assembly_stats.pl \
  -list "$TMP/eukaryotes.$NAME.txt" \
  -prev "$TMP/$NAME.known" \
  -outdir "$TMP/${NAME}_genomes" \
  -force
dos2unix "$TMP/${NAME}_genomes/assembly_stats"

echo "Generate json file..."
# Convert the semicolon-separated assembly_stats table into a JSON array and
# publish the results. Relies on TMP, NAME and WEB set earlier in the script.
stats_file="$TMP/${NAME}_genomes/assembly_stats"
json_file="$TMP/${NAME}_genomes/assembly_stats.json"

if [ ! -r "$stats_file" ]; then
  echo "[Error] Cannot read $stats_file" >&2
  exit 1
fi

# One JSON object is emitted per input record:
#   - records containing a double quote are skipped (they would break the
#     hand-built JSON strings);
#   - records whose 22nd field is empty are skipped;
#   - every empty value among the 36 emitted fields becomes -1, including
#     fields missing from short records, so the output stays valid JSON;
#   - the first '%' of each value is removed.
# Key spellings (including "MinScaffoldSize") are kept exactly as consumers
# already receive them.
awk '
BEGIN {
  FS = ";"

  # JSON key for each input column, in column order.
  keys = "organism assemblyID cumulativeSize numberOfScaffolds"
  keys = keys " maxScaffoldSize MinScaffoldSize avgScaffoldSize"
  keys = keys " scaffoldN50 scaffoldL50 scaffoldN80 scaffoldL80"
  keys = keys " scaffoldN90 scaffoldL90 nbN percentN"
  keys = keys " cumulativeSizeContig numberOfContigs maxContigSize"
  keys = keys " minContigSize avgContigSize contigN50 contigL50"
  keys = keys " contigN80 contigL80 contigN90 contigL90"
  keys = keys " updateDate url bioP group subGroup gSize"
  keys = keys " modifyDate status center bioS"
  nkeys = split(keys, key, " ")

  # Columns emitted as JSON strings; all the others are emitted bare.
  split("1 2 27 28 29 30 31 33 34 35 36", idx, " ")
  for (k in idx) {
    quoted[idx[k]] = 1
  }

  print "["
  first = 1
}

/"/ { next }

$22 != "" {
  if (!first) {
    print " ,"
  }
  first = 0

  print " {"
  for (i = 1; i <= nkeys; i++) {
    value = $i
    if (value == "") {
      value = "-1"
    }
    sub(/%/, "", value)
    if (i in quoted) {
      value = "\"" value "\""
    }
    print " \"" key[i] "\": " value (i < nkeys ? "," : "")
  }
  print " }"
}

END {
  print "]"
}
' < "$stats_file" > "$json_file" || {
  echo "[Error] Could not generate $json_file" >&2
  exit 1
}

echo "Update files..."
# Publish the JSON first; the lists of known assemblies are refreshed only
# once the previous copy succeeded.
cp "$json_file" "$WEB/${NAME}-assembly-stats.json" || {
  echo "[Error] Could not update $WEB/${NAME}-assembly-stats.json" >&2
  exit 1
}
cp "$stats_file" "$WEB/${NAME}.known" || {
  echo "[Error] Could not update $WEB/${NAME}.known" >&2
  exit 1
}
cp "$stats_file" "$TMP/${NAME}.known" || {
  echo "[Error] Could not update $TMP/${NAME}.known" >&2
  exit 1
}