#! /usr/bin/perl # Master list variables $| = 1; use DBD::Pg; use DBI; use vars qw($master_file $master_out $master_out_a @master $master_order $master_family $master_subfamily $master_genus $master_species $master $dbmain $dbname $statement $conn $str $current_order $current_family $current_subfamily $current_genus $current_species); $dbname = 'bioseqdb'; $dbhost = '192.168.0.8'; $dbuser = ''; $dbpasswd = ''; $master_file = '/home/flarson/Master_Species_List-2008-2.csv'; $master_out = '/home/flarson/Master_vs_NCBI.txt'; $master_out_e = '/home/flarson/Master_vs_NCBI_errors.txt'; # Parse the master list to compare Parse_Master(); #print "connected to $dbname, state is $str\n"; #@row_ary = $conn->selectrow_array($statement); #print "@row_ary\n"; sub Parse_Master { open(MAS, '<::encoding(iso-8859-1)',$master_file ) || die "Can not open $master_file"; open(MASO, '>::encoding(iso-8859-1)',$master_out ) || die "Can not open $master_out"; open(MASE, '>::encoding(iso-8859-1)',$master_out_e ) || die "Can not open $master_out_e"; $conn = DBI->connect("dbi:Pg:dbname=$dbname;host=$dbhost;","$dbuser","$dbpasswd"); $str = $conn->state; while() { undef(@master); undef($master_order); undef($master_family); undef($master_subfamily); undef($master_genus); undef($master_species); @master = split(/\,/,$_); chomp(@master); foreach (@master) { s/^\s+//; s/\s+$//; } next if $master[0] eq 'Order'; # print MASO "$master[4]\n"; # print MASO "@master\n"; if($master[0] ne $current_order) { if ($order_isolates) { print MASO " $genus_isolates isolates found in genus $current_genus\n"; print MASO " $subfamily_isolates isolates found subfamily $current_subfamily\n"; print MASO " $family_isolates isolates found in family $current_family\n"; print MASO "$order_isolates isolates found in order $current_order\n"; undef($genus_isolates); undef($subfamily_isolates); undef($family_isolates); undef($order_isolates); } $current_order = $master[0]; $name = $master[0]; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; #print "$statement\n"; my @master_order = $conn->selectrow_array($statement); print MASO "Order:$master[0] Taxon_ID:$master_order[0] Name_class:$master_order[2]\n";# if ($master_order); if(($master[0] ne $master_order[1]) && ($master[0] ne 'Unassigned')) { print MASE "Order:$master[0]\n"; $ordercount++; $name1 = substr $name, 0, 1, '%'; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; my @guess_order = $conn->selectrow_array($statement); print MASE "Possible matches: @guess_order\n" if($guess_order[1]); } } if(($master[1] ne $current_family) && ($master[1] ne 'Unassigned')) { if ($family_isolates) { print MASO " $genus_isolates isolates found in genus $current_genus\n"; print MASO " $subfamily_isolates isolates found subfamily $current_subfamily\n"; print MASO " $family_isolates isolates found in family $current_family\n"; undef($genus_isolates); undef($subfamily_isolates); undef($family_isolates); } $current_family = $master[1]; $name = $master[1]; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; #print "$statement\n"; my @master_family = $conn->selectrow_array($statement); print MASO " Family:$master[1] Taxon_ID:$master_family[0] Name_class:$master_family[2]\n";# if ($master_family); if($master[1] ne $master_family[1]) { print MASE "Family:$master[1]\n" ; $familycount++; $name1 = substr $name, 0, 1, '%'; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; my @guess_family = $conn->selectrow_array($statement); print MASE "Possible matches: @guess_family\n" if($guess_family[1]); } } if(($master[2] ne $current_subfamily) && ($master[2] ne 'Unassigned')) { if ($subfamily_isolates) { print MASO " $genus_isolates isolates found in genus $current_genus\n"; print MASO " $subfamily_isolates isolates found subfamily $current_subfamily\n"; undef($genus_isolates); undef($subfamily_isolates); } $current_subfamily = $master[2]; $name = $master[2]; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; #print "$statement\n"; my @master_subfamily = $conn->selectrow_array($statement); print MASO " Subfamily:$master[2] Taxon_ID:$master_subfamily[0] Name_class:$master_subfamily[2]\n";# if ($master_subfamily); if($master[2] ne $master_subfamily[1]) { print MASE "Subfamily:$master[2]\n" ; $subfamilycount++; $name1 = substr $name, 0, 1, '%'; $statement = "select taxon_id,name from taxon_name where name like '$name'"; my @guess_subfamily = $conn->selectrow_array($statement); print MASE "Possible matches: @guess_subfamily\n" if($guess_subfamily[1]); } } if(($master[3] ne $current_genus) && ($master[3] ne 'Unassigned')) { if ($genus_isolates) { print MASO " $genus_isolates isolates found in genus $current_genus\n"; undef($genus_isolates); } $current_genus = $master[3]; $name = $master[3]; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; #print "$statement\n"; my @master_genus = $conn->selectrow_array($statement); print MASO " Genus:$master[3] Taxon_ID:$master_genus[0] Name_class:$master_genus[2]\n";# if ($master_genus); if($master[3] ne $master_genus[1]) { print MASE "Genus:$master[3]\n" ; $genuscount++; $name1 = substr $name, 0, 1, '%'; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; my @guess_genus = $conn->selectrow_array($statement); if($guess_genus[1]) { print MASE "Possible matches: @guess_genus\n" ; } elsif ($master[3] eq 'Omicronpapillomavirus') { $name2 = 'Omikronpapillomavirus'; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name2'"; my @guess_genus = $conn->selectrow_array($statement); print MASE "Possible matches: @guess_genus\n" if($guess_genus[1]); } } } if(($master[4] ne $current_species) && ($master[4] ne 'Unassigned')) { $current_species = $master[4]; $name = $master[4]; $name =~ s/\'/\\'/g; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; #print "$statement\n"; my @master_species = $conn->selectrow_array($statement); undef($species_count); if($master_species[0]) { $species_taxid = $master_species[0]; $count_stmt = "select COUNT(DISTINCT(accession)) from bioentry join taxon_name using (taxon_id) where taxon_name.taxon_id='$species_taxid'"; $species_count = $conn->selectrow_array($count_stmt); #print "$master_species[0] $master_species[1]: $species_count\n"; $genus_isolates += $species_count; $subfamily_isolates += $species_count; $family_isolates += $species_count; $order_isolates += $species_count; $total_isolates += $species_count; } print MASO " Species:$master[4] Taxon_ID:$master_species[0] Name_class:$master_species[2] Isolates found:$species_count\n"; if($master[4] ne $master_species[1]) { print MASE "Species:$master[4]\n" ; $speciescount++; $name1 = substr $name, 0, 1, '%'; $statement = "select taxon_id,name,name_class from taxon_name where name like '$name'"; my @guess_species = $conn->selectrow_array($statement); print MASE "Possible matches: @guess_species\n" if($guess_species[1]); } } } print MASO " $genus_isolates isolates found in genus $current_genus\n"; print MASO " $subfamily_isolates isolates found subfamily $current_subfamily\n"; print MASO " $family_isolates isolates found in family $current_family\n"; print MASO "$order_isolates isolates found in order $current_order\n"; print MASO "ICTV orders not found in NCBI taxonomy: $ordercount\n"; print MASO "ICTV families not found in NCBI taxonomy: $familycount\n"; print MASO "ICTV subfamilies not found in NCBI taxonomy: $subfamilycount\n"; print MASO "ICTV genuses not found in NCBI taxonomy: $genuscount\n"; print MASO "ICTV species not found in NCBI taxonomy: $speciescount\n"; print MASO "Total isolate entries found in common taxonomy: $total_isolates\n"; print MASE "ICTV orders not found in NCBI taxonomy: $ordercount\n"; print MASE "ICTV families not found in NCBI taxonomy: $familycount\n"; print MASE "ICTV subfamilies not found in NCBI taxonomy: $subfamilycount\n"; print MASE "ICTV genuses not found in NCBI taxonomy: $genuscount\n"; print MASE "ICTV species not found in NCBI taxonomy: $speciescount\n"; print MASE "Total isolate entries found in common taxonomy: $total_isolates\n"; close(MAS); close(MASO); close(MASE); }