236232f2b6c67b0ee2c39a9baa9480c9a500423f angie Tue Jun 3 09:27:37 2025 -0700 Updated naming conventions diff --git src/hg/utils/otto/fluA/updateAndersenLab.sh src/hg/utils/otto/fluA/updateAndersenLab.sh index 04719fd1738..0e06c87f1e1 100755 --- src/hg/utils/otto/fluA/updateAndersenLab.sh +++ src/hg/utils/otto/fluA/updateAndersenLab.sh @@ -17,52 +17,75 @@ cd /data/tmp/angie # Extract sequences that are in SRA but not (yet) in GenBank. find ~/github/avian-influenza/fasta -name \*.fa \ | grep -vFwf <(cut -f 1 genbank_mapping.tsv) \ | xargs cat > sraNotGb.fa faSize sraNotGb.fa | head -2 # Rename those sequences to look nicer in the tree and include some metadata. csvToTab < ~/github/avian-influenza/metadata/SraRunTable_automated.csv \ | tail -n+2 \ | cut -f 1,10,16,19,31 \ | perl -wne 'chomp; ($run, $date, $country, $host, $sample) = split(/\t/); $year = $date; $year =~ s/^(\d{4}).*/$1/; - $sample =~ s/-original$//; $sample =~ s/-repeat2?//; - $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/; $host = ucfirst(lc($host)); $host =~ s/ /-/g; + $host =~ s/['"'"':;\[\]()]//g; + $country =~ s/:.*//; + if ($sample =~ m@^Influenza A virus (A/.*)/\d{4}\(H\dN\d\)\)?@) { + $sample = $1; + foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) { + print "Consensus_${run}_${segment}_cns_threshold_0.5_quality_20\t${sample}_$segment/$year|$run|$date\n"; + print "Consensus_${run}_${segment}_cns_threshold_0.75_quality_20\t${sample}_$segment/$year|$run|$date\n"; + } + } else { + $sample =~ s/-original$//; $sample =~ s/-repeat2?//; + $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/; foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) { print "Consensus_${run}_${segment}_cns_threshold_0.5_quality_20\tA/$host/$country/${sample}_$segment/$year|$run|$date\n"; + print "Consensus_${run}_${segment}_cns_threshold_0.75_quality_20\tA/$host/$country/${sample}_$segment/$year|$run|$date\n"; + } }' \ | grep -Fwf <(grep ^\> sraNotGb.fa | sed -re 's/^>//;') \ > srr_renaming.tsv wc -l srr_renaming.tsv faRenameRecords sraNotGb.fa srr_renaming.tsv andersen_lab.srrNotGb.renamed.fa faSize andersen_lab.srrNotGb.renamed.fa | head -2 mv andersen_lab.srrNotGb.renamed.fa $fluADir/ # Format metadata for my build echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tserotype\tsegment" \ > $fluADir/andersen_lab.srrNotGb.renamed.metadata.tsv csvToTab < ~/github/avian-influenza/metadata/SraRunTable_automated.csv \ | cut -f 1,5,6,9,10,16,18,19,31,33 \ | perl -wne 'chomp; ($run, $proj, $biosamp, $center, $date, $country, $loc, $host, $sample, $serotype) = split(/\t/); $year = $date; $year =~ s/^(\d{4}).*/$1/; - $sample =~ s/-original$//; $sample =~ s/-repeat2?//; - $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/; $host = ucfirst(lc($host)); $host =~ s/ /-/g; + $host =~ s/['"'"':;\[\]()]//g; + $country =~ s/: ?(.*)//; + $loc =~ s/: /:/; + $loc = $1 if ($loc eq "" && $1 ne ""); + if ($sample =~ m@^Influenza A virus (A/.*)/\d{4}\(H\dN\d\)\)?@) { + $sample = $1; + foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) { + print join("\t", "${sample}_$segment/$year|$run|$date", "", $date, $country, $loc, "", + $host, $proj, $biosamp, $run, $center, "", $serotype, $segment) . "\n"; + } + } else { + $sample =~ s/-original$//; $sample =~ s/-repeat2?//; + $sample =~ s/([0-9]{2}-[0-9]{6}-[0-9]{3})-(300|MTM)/$1/; foreach $segment (qw/HA MP M2 NA NP NS PA PB1 PB2/) { print join("\t", "A/$host/$country/${sample}_$segment/$year|$run|$date", "", $date, $country, $loc, "", $host, $proj, $biosamp, $run, $center, "", $serotype, $segment) . "\n"; + } }' \ | grep -Fwf <(grep ^\> $fluADir/andersen_lab.srrNotGb.renamed.fa | sed -re 's/^>//;') \ | sort -u \ >> $fluADir/andersen_lab.srrNotGb.renamed.metadata.tsv wc -l $fluADir/andersen_lab.srrNotGb.renamed.metadata.tsv echo done