#!/usr/bin/perl -w
# Parse ICTVdb files using PERL's HTML::Parser
# Parse specific files in desired order
#
# Layout of this section:
#   1. pragmas / modules
#   2. package-global declarations (grouped by the parser that uses them)
#   3. input locations and the table of per-family HTML files
#   4. output file names
#   5. a (currently disabled) list of manual taxonomy fixups
#
# NOTE(review): in the collapsed copy of this file the text reads
# "# use strict;".  It is treated here as an empty comment line followed by
# a live "use strict;" because every global used below is pre-declared via
# "use vars" -- confirm against the original file.
use strict;
use File::Find;
use HTML::Parser ();
#use HTML::Entities;

# Common variables
use vars qw(@parse_dirs $dir $filename $dirname $shortname $class
            $line_number $nid $name
            %order %family %subfamily %genus %species %strain %acronym
            %assesion
            %family_orig %order_orig %subfamily_orig %genus_orig
            %species_orig %strain_orig %acronym_orig %assesion_orig
            %synonym %synonym_orig
            %a_order %a_family %a_subfamily %a_genus %a_species %a_strain
            %a_synonym);

# ICTV_Index.pl variables
use vars qw($pos_ct $strain $f_numeric $initial_class);

# ICTV_fstA.pl variables
use vars qw($numeric @numeric $count $textual $VC );

# ICTV_Family variables
use vars qw($area $sf_numeric @sf_numeric $sf_count
            $genus_numeric @genus_numeric $genus_count
            $sp_numeric @sp_numeric $sp_count
            $order_numeric $strain_name $strain_numeric
            $str_assesion_number $str_acronym $t $family_name $I_flag
            %family_files @family_files $family_file $family_index
            $key $value);

# ICTV_Old_Family variables
use vars qw($h3_flag @order $tmp_name $color);

# ICTV_All output files
use vars qw($order_out $family_out $subfamily_out $genus_out $species_out
            $strain_out
            $order_out_a $family_out_a $subfamily_out_a $genus_out_a
            $species_out_a $strain_out_a
            $acronym_out $synonym_out $assesion_number $acronym $vector);

# Master list variables
use vars qw($master_file $master_out $master_out_a @master
            $master_order $master_family $master_subfamily $master_genus
            $master_species $master);

# Directories holding the mirrored ICTVdb HTML pages.
@parse_dirs = ('/home/flarson/phene.cpmc.columbia.edu/ICTVdb/Ictv','/home/flarson/phene.cpmc.columbia.edu/ICTVdb/ICTVdB');

# CSV master species list that Parse_Master compares against.
$master_file = '/home/flarson/Master_Species_List-2008-2.csv';

$family_index = 'fs_index.htm'; # Index to family files

# Per-family page => page layout.  The main driver hands 'new' pages to
# Parse_Family; 'old' pages are currently skipped (the Parse_Old_Family
# call in the driver is commented out).
%family_files = qw(
    fs_adeno.htm new    fs_anell.htm new    fs_arena.htm new    fs_arter.htm new
    fs_ascov.htm new    fs_asfar.htm new    fs_astro.htm new    fs_avsun.htm new
    fs_bacul.htm new    fs_barna.htm new    fs_benyv.htm new    fs_birna.htm old
    fs_borna.htm old    fs_bromo.htm new    fs_bunya.htm new    fs_calic.htm old
    fs_caudo.htm old    fs_cauli.htm new    fs_chera.htm new    fs_chrys.htm old
    fs_circo.htm new    fs_clost.htm new    fs_comov.htm old    fs_coron.htm new
    fs_corti.htm new    fs_cysto.htm new    fs_delta.htm old    fs_dicis.htm new
    fs_endor.htm new    fs_filov.htm old    fs_flavi.htm new    fs_flexi.htm new
    fs_furov.htm new    fs_fusel.htm new    fs_gemin.htm new    fs_gutta.htm new
    fs_hepad.htm new    fs_hepev.htm new    fs_herpe.htm new    fs_horde.htm new
    fs_hypov.htm new    fs_idaeo.htm new    fs_iflav.htm old    fs_inovi.htm new
    fs_irido.htm new    fs_leviv.htm new    fs_lipot.htm new    fs_luteo.htm old
    fs_marna.htm old    fs_metav.htm new    fs_micro.htm new    fs_mimiv.htm new
    fs_monon.htm old    fs_myovi.htm new    fs_nanov.htm new    fs_narna.htm old
    fs_nidov.htm old    fs_nimav.htm new    fs_nodav.htm old    fs_ophio.htm new
    fs_ortho.htm new    fs_ourmi.htm new    fs_papil.htm new    fs_param.htm old
    fs_parti.htm old    fs_parvo.htm new    fs_peclu.htm new    fs_phyco.htm new
    fs_picor.htm new    fs_plasm.htm new    fs_podov.htm new    fs_polyd.htm new
    fs_polyo.htm new    fs_pomov.htm new    fs_pospi.htm new    fs_potyv.htm old
    fs_poxvi.htm new    fs_prion.htm old    fs_pseud.htm new    fs_reovi.htm old
    fs_retro.htm new    fs_rhabd.htm old    fs_rhizi.htm new    fs_roniv.htm new
    fs_rudiv.htm new    fs_sadwa.htm new    fs_salte.htm new    fs_satel.htm old
    fs_sequi.htm new    fs_sipho.htm new    fs_sobem.htm new    fs_tecti.htm new
    fs_tenui.htm new    fs_tetra.htm new    fs_tobam.htm new    fs_tobra.htm new
    fs_togav.htm new    fs_tombu.htm old    fs_totiv.htm old    fs_tymov.htm new
    fs_umbra.htm old    fs_unass.htm new    fs_varic.htm new    fs_virga.htm new
    fs_viroi.htm new
);

# Output files, one per taxonomic level.
$order_out = "/home/flarson/ICTV_Orders.txt";
$family_out = "/home/flarson/ICTV_Families.txt";
$subfamily_out = "/home/flarson/ICTV_Subfamilies.txt";
$genus_out = "/home/flarson/ICTV_Genuses.txt";
$species_out = "/home/flarson/ICTV_Species.txt";
$strain_out = "/home/flarson/ICTV_Strains.txt";
$master_out = "/home/flarson/ICTV_Master.txt";
$acronym_out = "/home/flarson/ICTV_Acronyms.txt";
$synonym_out = "/home/flarson/ICTV_Synonyms.txt";

# The "_Alph" variants of the same outputs.
$order_out_a = "/home/flarson/ICTV_Orders_Alph.txt";
$family_out_a = "/home/flarson/ICTV_Families_Alph.txt";
$subfamily_out_a = "/home/flarson/ICTV_Subfamilies_Alph.txt";
$genus_out_a = "/home/flarson/ICTV_Genuses_Alph.txt";
$species_out_a = "/home/flarson/ICTV_Species_Alph.txt";
$strain_out_a = "/home/flarson/ICTV_Strains_Alph.txt";
$master_out_a = "/home/flarson/ICTV_Master_Alph.txt";

# Fix some known problems before we start
# (everything in this list is disabled; it is kept for reference)
# $shortname = $line_number = 'bogus';
# Add_Species('02.054.0.01.010.','Pseudomonad phage gh-1'); # Fix
# Add_Order('00.','Unassigned'); #fix
# Add_Genus('00.038.0.04.','Deltalipothrixvirus'); #Add
# Add_Species('00.038.0.04.001.','Acidianus filamentous virus 2'); #Add
# Add_Species('00.083.0.01.003.','Acidianus rod-shaped virus 1'); #Add
# Add_Family('00.113.','Ampullaviridae'); #Add
# Add_Genus('00.113.0.01.','Ampullavirus'); #Add
# Add_Species('00.113.0.01.001','Acidianus bottle-shaped virus'); #Add
# Add_Family('00.117.','Globuloviridae'); #Add
# Add_Genus('00.117.0.01.','Globulovirus'); #Add
# Add_Species('00.117.0.01.001.','Pyrobaculum spherical virus'); #Add
# Add_Species('00.117.0.01.002.','Thermoprotheus tenax spherical virus 1'); #Add
# Add_Genus('00.058.1.84.','Cervidpoxvirus'); #Add
# Add_Species('00.058.1.84.001.','Deerpox virus W-848-83'); #Add
# Add_Genus('00.058.1.00.','Unassigned'); #Add
# Add_Species('00.058.1.00.001.','Squirrel poxvirus'); #Add
# Add_Species('00.058.2.02.013.',"Choristoneura fumiferana entomopoxvirus L"); # Fix
# Add_Species('00.058.2.02.007.',"Chorizagrotis auxiliars entomopoxvirus 'L"); # Fix
# Add_Species('00.058.2.02.014.',"Heliothis armigera entomopoxvirus L"); # Fix
# Add_Genus('00.006.0.01.','Alphabaculovirus');
# Add_Genus('00.006.0.02.','Betabaculovirus');
# Add_Genus('00.006.0.03.','Gammabaculovirus');
# Add_Genus('00.006.0.04.','Deltabaculovirus');
# # Add_Genus('00.006.0.81.','Alphabaculovirus');
# # Add_Genus('00.006.0.82.','Betabaculovirus');
# Add_Species('00.006.0.01.001.','Autographa californica multiple nucleopolyhedrovirus'); # Add_Species('00.006.0.01.043.','Agrotis ipsilon multiple nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.002','Adoxophyes honmai nucleopolyhedrovirus'); # Add_Species('00.006.0.01.002.','Anticarsia gemmatalis multiple nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.005','Bombyx mori nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.006','Buzura suppressaria nucleopolyhedrovirus'); # Add_Species('00.006.0.01.006.','Choristoneura fumiferana DEF multiple nucleopolyhedrovirus'); # Add_Species('00.006.0.01.005.','Choristoneura fumiferana multiple nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.009','Choristoneura rosaceana nucleopolyhedrovirus'); # Add_Species('00.006.0.01.202.','Ectropis obliqua nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.011','Epiphyas postvittana nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.012','Helicoverpa armigera nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.013','Helicoverpa zea single nucleopolyhedrovirus'); # Add_Species('00.006.0.01.008.','Lymantria dispar multiple nucleopolyhedrovirus'); # Add_Species('00.006.0.01.009.','Mamestra brassicae multiple nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.016','Mamestra configurata nucleopolyhedrovirus A'); # # Add_Species('00.006.0.01.017','Mamestra configurata nucleopolyhedrovirus B'); # Add_Species('00.006.0.01.010.','Orgyia pseudotsugata multiple nucleopolyhedrovirus'); # Add_Species('00.006.0.01.013.','Spodoptera exigua multiple nucleopolyhedrovirus'); # Add_Species('00.006.0.01.014.','Spodoptera frugiperda multiple nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.021','Spodoptera littoralis nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.022','Spodoptera litura nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.023','Thysanoplusia orichalcea nucleopolyhedrovirus'); # # Add_Species('00.006.0.01.024','Trichoplusia ni single nucleopolyhedrovirus'); # 
Add_Species('00.006.0.02.001.','Cydia pomonella granulovirus'); # Add_Species('00.006.0.03.001.','Neodiprion lecontei nucleopolyhedrovirus'); # Add_Species('00.006.0.03.002.','Neodiprion sertifer nucleopolyhedrovirus'); # Add_Species('00.006.0.04.001.','Culex nigripalpus nucleopolyhedrovirus'); # Add_Genus('00.103.0.01.','Whispovirus'); # Add_Species('00.103.0.01.001.','White spot syndrome virus'); # Add_Order('04.','Herpesvirales'); # Add_Family('04.001.','Herpesviridae'); # Add_Family('04.002.','Alloherpesviridae'); # Add_Family('04.003.','Malacoherpesviridae'); # Add_Subfamily('04.001.0.','Unassigned'); # Add_Subfamily('04.001.1.','Alphaherpesvirinae'); # Add_Subfamily('04.001.2.','Betaherpesvirinae'); # Add_Subfamily('04.001.3.','Gammaherpesvirinae'); # Add_Subfamily('04.002.0.','Unassigned'); # Add_Subfamily('04.003.0.','Unassigned'); # Add_Genus('04.002.0.01.','Ictalurivirus'); # Add_Genus('04.002.0.00.','Unassigned'); # Add_Species('04.002.0.00.001.','Cyprinid herpesvirus 3'); # Add_Genus('04.001.1.00.','Unassigned'); # Add_Genus('04.001.1.01.','Simplexvirus'); # Add_Genus('04.001.1.02.','Varicellovirus'); # Add_Genus('04.001.1.03.','Mardivirus'); # Add_Genus('04.001.1.04.','Iltovirus'); # Add_Genus('04.001.2.00.','Unassigned'); # Add_Genus('04.001.2.01.','Cytomegalovirus'); # Add_Genus('04.001.2.02.','Muromegalovirus'); # Add_Genus('04.001.2.03.','Roseolovirus'); # Add_Genus('04.001.2.04.','Proboscivirus'); # Add_Genus('04.001.3.00.','Unassigned'); # Add_Genus('04.001.3.01.','Lymphocryptovirus'); # Add_Genus('04.001.3.02.','Rhadinovirus'); # Add_Genus('04.001.3.03.','Macavirus'); # Add_Genus('04.001.3.04.','Percavirus'); # Add_Genus('04.001.0.00.','Unassigned'); # Add_Genus('04.003.0.00.','Unassigned'); # Add_Genus('04.003.0.01.','Ostreavirus'); # Add_Species('04.001.0.00.010.','Iguanid herpesvirus 2'); # Add_Species('04.003.0.01.011.','Ostreid herpesvirus 1'); # Add_Species('04.001.1.01.020.','Macacine herpesvirus 1'); # 
Add_Species('04.001.1.01.021.','Papiine herpesvirus 2'); # Add_Species('04.001.1.00.001.','Chelonid herpesvirus 5'); # Add_Species('04.001.1.00.002.','Chelonid herpesvirus 6'); # Add_Species('04.001.2.01.020.','Macacine herpesvirus 3'); # Add_Species('04.001.2.01.021.','Panine herpesvirus 2'); # Add_Species('04.001.3.01.010.','Gorilline herpesvirus 1'); # Add_Species('04.001.3.01.011.','Macacine herpesvirus 4'); # Add_Species('04.001.3.01.012.','Panine herpesvirus 1'); # Add_Species('04.001.3.01.013.','Papiine herpesvirus 1'); # Add_Species('04.001.3.01.014.','Bovine herpesvirus 6'); # Add_Species('04.001.3.01.015.','Caprine herpesvirus 2'); # Add_Species('04.001.3.01.016.','Macacine herpesvirus 5'); # Add_Species('04.001.3.00.001.','Phocid herpesvirus 2'); # Add_Species('00.001.0.81.025.','Simian adenovirus A'); # Add_Species('00.001.0.83.008.','Snake adenovirus A'); # Add_Species('00.099.0.02.014.','Human papillomavirus - cand90'); # Add_Species('00.099.0.02.012.','Rhesus monkey papillomavirus - 1'); # Add_Species('00.099.0.03.004.','Human papillomavirus - cand92'); # Add_Species('00.099.0.03.005.','Human papillomavirus - cand96'); # Add_Species('00.031.0.00.064.','Suid herpesvirus 3'); # Add_Species('00.031.0.00.065.','Suid herpesvirus 4'); # Add_Species('00.031.0.00.066.','Suid herpesvirus 5'); # Add_Species('04.001.3.02.019.','Saguinine herpesvirus 1'); # Add_Species('00.035.0.01.027.','Vibrio phage Vf12'); # Add_Species('00.029.0.02.009.','Horseradish curly top virus'); # Add_Species('00.029.0.02.010.','Pepper curly top virus'); # Add_Species('00.029.0.03.201.','Ageratum leaf curl virus'); # Add_Species('00.029.0.03.202.','Ageratum yellow vein Hualian virus'); # Add_Species('00.029.0.03.203.','Alternanthera yellow vein virus'); # Add_Species('00.029.0.03.204.','Bitter gourd yellow vein virus'); # Add_Species('00.029.0.03.205.','Boerhavia yellow spot virus'); # Add_Species('00.029.0.03.206.','Cabbage leaf curl Jamaica virus'); # 
Add_Species('00.029.0.03.207.','Clerodendron golden mosaic virus'); # Add_Species('00.029.0.03.212.','Corchorus golden mosaic virus'); # Add_Species('00.029.0.03.213.','Corchorus yellow spot virus'); # Add_Species('00.029.0.03.214.','Corchorus yellow vein Vietnam virus'); # Add_Species('00.029.0.03.215.','Cotton leaf curl Bangalore virus'); # Add_Species('00.029.0.03.216.','Desmodium leaf distortion virus'); # Add_Species('00.029.0.03.217.','Dicliptera yellow mottle virus'); # Add_Species('00.029.0.03.225.','East African cassava mosaic Kenya virus'); # Add_Species('00.029.0.03.226.','Erectites yellow mosaic virus'); # Add_Species('00.029.0.03.227.','Eupatorium yellow vein mosaic virus'); # Add_Species('00.029.0.03.228.','Euphorbia leaf curl Guangxi virus'); # Add_Species('00.029.0.03.240.','Dicliptera yellow mottle Cuba virus'); # Add_Species('00.029.0.03.241.','Honeysuckle yellow vein Kagoshima virus'); # Add_Species('00.029.0.03.242.','Horsegram yellow mosaic virus'); # Add_Species('00.029.0.03.243.','Kudzu mosaic virus'); # Add_Species('00.029.0.03.244.','Lindernia anagallis yellow vein virus'); # Add_Species('00.029.0.03.245.','Ludwigia yellow vein Vietnam virus'); # Add_Species('00.029.0.03.246.','Ludwigia yellow vein virus'); # Add_Species('00.029.0.03.247.','Malvastrum leaf curl Guangdong virus'); # Add_Species('00.029.0.03.248.','Malvastrum leaf curl virus'); # Add_Species('00.029.0.03.249.','Malvastrum yellow leaf curl virus'); # Add_Species('00.029.0.03.250.','Malvastrum yellow mosaic virus'); # Add_Species('00.029.0.03.251.','Malvastrum yellow vein Yunnan virus'); # Add_Species('00.029.0.03.252.','Mesta yellow vein mosaic virus'); # Add_Species('00.029.0.03.253.','Mimosa yellow leaf curl virus'); # Add_Species('00.029.0.03.254.','Okra yellow crinkle virus'); # Add_Species('00.029.0.03.260.','Okra yellow mosaic Mexico virus'); # Add_Species('00.029.0.03.261.','Okra yellow mottle Iguala virus'); # Add_Species('00.029.0.03.262.','Pedilenthus leaf curl 
virus'); # Add_Species('00.029.0.03.263.','Pepper leaf curl Lahore virus'); # Add_Species('00.029.0.03.264.','Pepper leaf curl Pakistan virus'); # Add_Species('00.029.0.03.265.','Pepper yellow leaf curl Indonesia virus'); # Add_Species('00.029.0.03.266.','Pepper yellow vein Mali virus'); # Add_Species('00.029.0.03.267.','Pumpkin yellow mosaic virus'); # Add_Species('00.029.0.03.268.','Radish leaf curl virus'); # Add_Species('00.029.0.03.269.','Rhynchosia golden mosaic Sinaloa virus'); # Add_Species('00.029.0.03.270.','Senecio yellow mosaic virus'); # Add_Species('00.029.0.03.274.','Sida leaf curl virus'); # Add_Species('00.029.0.03.275.','Sida micrantha mosaic virus'); # Add_Species('00.029.0.03.276.','Sida yellow mosaic Yucatan virus'); # Add_Species('00.029.0.03.277.','Sida yellow vein Madurai virus'); # Add_Species('00.029.0.03.278.','Sida yellow vein Vietnam virus'); # Add_Species('00.029.0.03.279.','Siegesbeckia yellow vein Guangxi virus'); # Add_Species('00.029.0.03.280.','Siegesbeckia yellow vein virus'); # Add_Species('00.029.0.03.281.','Soybean blistering mosaic virus'); # Add_Species('00.029.0.03.282.','Sweet potato leaf curl Canary virus'); # Add_Species('00.029.0.03.283.','Sweet potato leaf curl China virus'); # Add_Species('00.029.0.03.284.','Sweet potato leaf curl Lanzarote virus'); # Add_Species('00.029.0.03.285.','Sweet potato leaf curl Spain virus'); # Add_Species('00.029.0.03.286.','Tobacco leaf curl Cuba virus'); # Add_Species('00.029.0.03.287.','Tomato chino La Paz virus'); # Add_Species('00.029.0.03.288.','Tomato leaf curl Arusha virus'); # Add_Species('00.029.0.03.289.','Tomato leaf curl Comoros virus'); # Add_Species('00.029.0.03.290.','Tomato leaf curl Guangdong virus'); # Add_Species('00.029.0.03.291.','Tomato leaf curl Guangxi virus'); # Add_Species('00.029.0.03.292.','Tomato leaf curl Hsinchu virus'); # Add_Species('00.029.0.03.293.','Tomato leaf curl Java virus'); # Add_Species('00.029.0.03.294.','Tomato leaf curl Joydebpur virus'); # 
Add_Species('00.029.0.03.295.','Tomato leaf curl Kerala virus'); # Add_Species('00.029.0.03.296.','Tomato leaf curl Madagascar virus'); # Add_Species('00.029.0.03.297.','Tomato leaf curl Mali virus'); # Add_Species('00.029.0.03.298.','Tomato leaf curl Mayotte virus'); # Add_Species('00.029.0.03.299.','Tomato leaf curl Pakistan virus'); # Add_Species('00.029.0.03.300.','Tomato leaf curl Pune virus'); # Add_Species('00.029.0.03.301.','Tomato leaf curl Seychelles virus'); # Add_Species('00.029.0.03.302.','Tomato leaf curl Uganda virus'); # Add_Species('00.029.0.03.303.','Tomato mild yellow leaf curl Aragua virus'); # Add_Species('00.029.0.03.304.','Tomato mosaic leaf curl virus'); # Add_Species('00.029.0.03.305.','Tomato yellow leaf curl Axarquia virus'); # Add_Species('00.029.0.03.306.','Tomato yellow leaf curl Guangdong virus'); # Add_Species('00.029.0.03.307.','Tomato yellow leaf curl Indonesia virus'); # Add_Species('00.029.0.03.308.','Tomato yellow leaf curl Mali virus'); # Add_Species('00.029.0.03.309.','Tomato yellow leaf curl Vietnam virus'); # Add_Species('00.029.0.03.310.','Tomato yellow margin leaf curl virus'); # Add_Species('00.029.0.03.311.','Tomato yellow spot virus'); # Add_Species('00.029.0.03.312.','Tomato yellow vein streak virus'); # Add_Species('00.029.0.03.313.','Vernonia yellow vein virus'); # Add_Species('00.029.0.03.314.','Porcine circovirus-1'); # Add_Species('00.029.0.03.315.','Adeno-associated virus-1'); # Add_Species('00.029.0.03.316.','Adeno-associated virus-2'); # Add_Species('00.029.0.03.317.','Adeno-associated virus-3'); # Add_Species('00.029.0.03.318.','Adeno-associated virus-4'); # Add_Species('00.029.0.03.319.','Adeno-associated virus-5'); # Add_Species('00.029.0.03.320.','Sida yellow mosaic China virus'); # Add_Species('00.029.0.03.321.','Spilanthes yellow vein virus'); # #Add_Species('00.029.0.03.322.','Porcine circovirus-2'); # Add_Species('00.016.0.01.009.','Starling circovirus'); # Add_Species('00.015.0.05.023.','Spiraea yellow 
leaf spot virus'); # Add_Species('00.097.0.02.008.','Saccharomyces cerevisiae Ty5 virus'); # #Add_Species('00.061.1.02.005.','Finkel-Biskis-Jinkins murine sarcoma virus'); # #Add_Species('00.029.0.03.327.','Harvey murine sarcoma virus'); # #Add_Species('00.029.0.03.328.','Kirsten murine sarcoma virus'); # #Add_Species('00.029.0.03.329.','Moloney murine sarcoma virus'); # Add_Genus('00.060.0.13.','Cardoreovirus'); # Add_Species('00.060.0.13.001.','Eriocheir sinensis reovirus'); # Add_Genus('00.060.0.14.','Dinovernavirus'); # Add_Species('00.060.0.14.001.','Aedes pseudoscutellaris reovirus'); # Add_Genus('00.060.0.15.','Mimoreoviru'); # Add_Species('00.060.0.15.001.','Micromonas pusilla reovirus'); # #Add_Species('00.060.0.02.025.','St Croix River virus'); # Add_Species('00.060.0.05.007.','Aquareovirus G'); # Add_Species('00.060.0.11.335.','Idnoreovirus - 1'); # Add_Species('00.060.0.11.336.','Idnoreovirus - 2'); # Add_Species('00.060.0.11.337.','Idnoreovirus - 3'); # Add_Species('00.060.0.1.338.','Idnoreovirus - 4'); # Add_Species('00.060.0.11.339.','Idnoreovirus - 5'); # Add_Species('00.060.0.08.005.','Rice dwarf virus'); # Add_Family('00.118.','Picobirnaviridae'); # Add_Family('00.119.','Mimiviridae'); # Add_Genus('00.118.0.01','Picobirnavirus'); # Add_Genus('00.119.0.01','Mimivirus'); # Add_Species('00.118.0.01.001.','Human picobirnavirus'); # Add_Species('00.118.0.01.002.','Rabbit picobirnavirus'); # Add_Species('00.119.0.01.001.','Acanthamoeba polyphaga mimivirus'); # Add_Genus('00.000.0.01.','Polemovirus'); # Add_Genus('00.000.0.02.','Hepevirus'); # Add_Genus('00.000.0.03.','Tobamovirus'); # Add_Genus('00.000.0.04.','Tobravirus'); # Add_Genus('00.000.0.05.','Hordeivirus'); # Add_Genus('00.000.0.06.','Furovirus'); # Add_Genus('00.000.0.07.','Pomovirus'); # Add_Genus('00.000.0.08.','Pecluvirus'); # Add_Genus('00.000.0.09.','Benyvirus'); # Add_Species('00.000.0.01.001.','Poinsettia latent virus'); # Add_Species('00.000.0.02.001.','Hepatitis E virus'); # 
Add_Species('00.094.0.01.004.','Mirafiori lettuce big-vein virus'); # Add_Species('00.094.0.01.007.','Tulip mild mottle mosaic virus'); # Add_Species('00.011.0.01.027.','M-Poko virus'); # Add_Species('00.011.0.02.24.','Saaremaa virus'); # Add_Species('00.097.0.02.009.','Volvox carteri Lueckenbuesser virus'); # Add_Species('00.003.0.01.010.','Junan virus'); # Add_Species('00.037.0.02.001.','Enterobacteria phage Qbeta'); # Add_Species('00.000.2.00.040.','Solenopsis invicta virus-1'); # Add_Species('05.002.0.01.004.','Ectropis obliqua virus'); # Add_Species('05.002.0.01.005.','Varroa destructor virus-1'); # Add_Species('05.002.0.01.006.','Deformed wing virus'); # Add_Species('00.111.0.01.006.','Stocky prune virus'); # Add_Species('00.018.0.02.004.','Gentian mosaic virus'); # Add_Species('00.018.0.02.005.','Lamium mild mosaic virus'); # Add_Species('00.018.0.03.034.','Grapevine Anatolian ringspot virus'); # Add_Species('00.018.0.03.035.','Grapevine deformation virus'); # Add_Order('05.','Picornavirales'); # Add_Family('05.000.','Unassigned'); # Add_Family('05.001.','Picornaviridae'); # Add_Family('05.002.','Iflaviridae'); # Add_Family('05.003.','Dicistroviridae'); # Add_Family('05.004.','Marnaviridae'); # Add_Family('05.005.','Sequiviridae'); # Add_Family('05.006.','Comoviridae'); # Add_Genus('05.002.0.01.','Iflavirus'); # Add_Genus('05.000.0.01.','Sadwavirus'); # Add_Genus('05.000.0.02.','Cheravirus'); # Add_Species('00.057.0.01.128.','Basella rugose mosaic virus'); # Add_Species('00.057.0.01.129.','Chinese artichoke mosaic virus'); # Add_Species('00.057.0.01.130.','Daphne mosaic virus'); # Add_Species('00.057.0.01.131.','East Asian Passiflora virus'); # Add_Species('00.057.0.01.132.','Fritillary virus Y'); # Add_Species('00.057.0.01.133.','Kalanchoa mosaic virus'); # Add_Species('00.057.0.01.134.','Meadow saffron breaking virus'); # Add_Species('00.057.0.01.135.','Passiflora chlorosis virus'); # Add_Species('00.057.0.01.136.','Pennisetum mosaic virus'); # 
Add_Species('00.057.0.01.137.','Pfaffia mosaic virus'); # Add_Species('00.057.0.01.138.','Ranunculus leaf distortion virus'); # Add_Species('00.057.0.01.139.','Ranunculus mild mosaic virus'); # Add_Species('00.057.0.01.140.','Ranunculus mosaic virus'); # Add_Species('00.057.0.01.141.','Spiranthes mosaic virus 2'); # Add_Species('00.057.0.01.142.','Sweet potato virus 2'); # Add_Species('00.057.0.01.143.','Thunberg fritillary mosaic virus'); # Add_Species('00.057.0.01.144.','Tobacco etch virus'); # Add_Species('00.057.0.01.145.','Tobacco vein banding mosaic virus'); # Add_Species('00.057.0.01.146.','Tobacco vein mottling virus'); # Add_Species('00.057.0.01.147.','Tradescantia mild mosaic virus'); # Add_Species('00.057.0.01.148.','Tuberose mild mosaic virus'); # Add_Species('00.057.0.01.149.','Tuberose mild mottle virus'); # Add_Species('00.057.0.01.150.','Tulip breaking virus'); # Add_Species('00.057.0.01.151.','Tulip mosaic virus'); # Add_Species('00.057.0.01.152.','Turnip mosaic virus'); # Add_Species('00.057.0.01.153.','Zantedeschia mild mosaic virus'); # Add_Species('00.057.0.05.005.','Squash vein yellowing virus'); # Add_Species('00.057.0.04.003.','Ranunculus latent virus'); # Add_Species('00.057.0.00.009.','Tomato mild mottle virus'); # Add_Species('00.070.0.01.012.','Nudaurelia capensis beta virus'); # Add_Species('00.070.0.02.003.','Dendrolimus punctatus virus'); # Add_Species('00.070.0.01.004.','Nudaurelia capensis omega virus'); # Add_Species('00.039.0.01.001.','Barley yellow dwarf virus - PAV'); # Add_Species('00.039.0.01.002.','Barley yellow dwarf virus - MAV'); # Add_Species('00.039.0.01.003.','Barley yellow dwarf virus - PAS'); # Add_Species('00.039.0.02.009.','Cereal yellow dwarf virus - RPS'); # Add_Species('00.039.0.02.011.','Cereal yellow dwarf virus - RPV'); # Add_Species('00.039.0.02.012.','Turnip yellows virus'); # Add_Species('00.039.0.02.013.','Turnip yellows virus'); # Add_Species('00.039.0.00.006.','Barley yellow dwarf virus - GPV'); # 
Add_Species('00.039.0.00.020.','Barley yellow dwarf virus - RMV'); # Add_Species('00.039.0.00.021.','Tobacco necrotic dwarf virus'); # Add_Species('00.039.0.00.022.','Tobacco vein distorting virus'); # Add_Species('00.078.0.01.008.','Tobacco bushy top virus'); # Add_Species('00.078.0.01.009.','Tobacco mottle virus'); # Add_Species('00.074.0.01.015.','Cucumber Bulgarian virus'); # Add_Species('00.074.0.01.017.','Havel River virus'); # Add_Species('00.074.0.01.008.','Lato River virus'); # Add_Species('00.074.0.01.018.','Limonium flower distortion virus'); # Add_Species('00.074.0.01.010.','Neckar River virus'); # Add_Species('00.074.0.01.019.','Pelargonium necrotic spot virus'); # Add_Species('00.074.0.01.013.','Sitke waterborne virus'); # Add_Species('00.074.0.02.023.','Angelonia flower break virus'); # Add_Species('00.074.0.02.024.','Pea stem necrosis virus'); # Add_Species('00.074.0.02.025.','Turnip crinkle virus'); # Add_Species('00.074.0.03.010.','Olive mild mosaic virus'); # Add_Species('00.074.0.03.011.','Tobacco necrosis virus D'); # Add_Species('03.019.0.01.018.','Duck coronavirus'); # Add_Species('03.019.0.01.019.','Equine coronavirus'); # Add_Species('03.019.0.01.020.','Goose coronavirus'); # Add_Species('03.019.0.01.021.','Human coronavirus HKU1'); # Add_Species('03.019.0.01.022.','Human coronavirus NL63'); # Add_Species('03.019.0.01.023.','Pigeon coronavirus'); # Add_Species('03.004.0.01.004.','Porcine respiratory and reproductive syndrome virus'); # Add_Species('00.073.0.01.032.','Mosso das Pedras virus (78V3531)'); # Add_Species('00.073.0.01.019.','O-nyong-nyong virus'); # Add_Species('00.000.0.03.036.',"Sammons's Opuntia virus"); # Add_Species('00.010.0.02.009.','Fragaria chiloensis latent virus'); # Add_Species('00.077.0.01.019.','Anagyris vein yellowing virus'); # Add_Species('00.077.0.01.020.','Nemesia ring necrosis virus'); # Add_Species('00.077.0.01.021.','Scrophularia mottle virus'); # Add_Species('00.077.0.01.022.','Voandzeia necrotic mosaic 
# virus');   # (tail of the commented-out Add_Species call wrapped from the previous line)
# Add_Species('00.077.0.01.023.','Wild cucumber mosaic virus');
# Add_Species('00.077.0.02.005.','Citrus sudden death-associated virus');
# Add_Species('00.017.0.01.014.','Mint virus 1');
# Add_Species('00.017.0.02.011.','Blackberry yellow vein-associated virus');
# Add_Species('00.017.0.02.012.','Strawberry pallidosis-associated virus');
# Add_Species('00.017.0.00.031.','Mint vein banding-associated virus');
# Add_Species('00.056.0.01.031.','Alstroemeria virus X');
# Add_Species('00.056.0.01.032.','Mint virus X');
# Add_Species('00.056.0.01.033.','Opuntia virus X');
# Add_Species('00.056.0.01.034.','Schlumbergera virus X');
# Add_Species('00.056.0.01.035.','Zygocactus virus X');
# Add_Species('00.056.0.04.016.','Kalanchoa latent virus');
# Add_Species('00.056.0.04.041.','Melon yellowing-associated virus');
# Add_Species('00.056.0.04.042.','Narcissus symptomless virus');
# Add_Species('00.056.0.04.043.',"Sint-Jan's onion latent virus");
# Add_Species('00.056.0.04.044.','Sweet potato chlorotic fleck virus');
# Add_Species('00.056.0.05.009.','Grapevine rupestris stem pitting-associated virus');
# Add_Species('00.056.0.08.007.','Apricot pseudo-chlorotic leaf spot virus');
# Add_Species('80.001.0.01.010.','Tomato chlorotic dwarf viroid');
# Add_Species('80.001.0.01.011.','Citrus bark cracking viroid');
# Add_Species('80.001.0.04.011.','Citrus dwarfing viroid');
# Add_Genus('80.002.0.03','Elaviroid');
# Add_Species('80.002.0.03.001.','Citrus dwarfing viroid');
# Add_Genus('00.009.0.04','Blosnavirus');
# Add_Species('00.009.0.04.001.','Blotched snakehead virus');
# Add_Genus('00.056.0.09','Citrivirus');
# Add_Species('00.056.0.09.001','Citrus leaf blotch virus');
# undef($shortname);
# undef($line_number);

# Parse files in order of importance to get best data first
$shortname = 'fr-fst-a.htm';
Parse_fstA('/home/flarson/phene.cpmc.columbia.edu/Ictv/fr-fst-a.htm');

# Get good higher level info first
while (($key,$value) = each (%family_files)) {
    #print "Key is $key\n";
    $shortname = $key;
    if ($value eq 'new') {
        #print "$shortname\n";
        Parse_Family("/home/flarson/phene.cpmc.columbia.edu/Ictv/$shortname");
    }
    else {
        # Old-layout pages are skipped for now.
        #print "$shortname\n";
        #Parse_Old_Family("/home/flarson/ICTVdb/Ictv/$shortname");
    }
}

# One index page per initial letter.  A lexical loop variable is used
# because the parsers called here read their files through $_ and would
# otherwise overwrite the loop's own $_.
foreach my $letter ('A' .. 'Z') {
    $shortname = "vn_indx".$letter.".htm";
    Parse_Index("/home/flarson/phene.cpmc.columbia.edu/Ictv/$shortname");
}

# Parse the master list to compare
#Parse_Master();

# Print it out and see what we have
Print_Order();
Print_Family();
Print_Subfamily();
Print_Genus();
Print_Species();
Print_Strain();
#Print_Acronym();
Print_Synonym();
Print_Order_A();
Print_Family_A();
Print_Subfamily_A();
Print_Genus_A();
Print_Species_A();
Print_Strain_A();

#########################################################
# Parse and compare Master list to see if I got it all #
#########################################################
# Reads $master_file as comma-separated rows, trims every field, skips the
# header row (first field 'Order'), looks fields 0..4 up in %a_order,
# %a_family, %a_subfamily, %a_genus and %a_species, and writes
# "<looked-up value>:<name> " to $master_out for every level that was
# found.  The output line is only terminated when the species lookup
# succeeds.  $master_out_a is opened (and therefore created/truncated) but
# nothing is written to it here.
# Takes no arguments; dies if any of the three files cannot be opened.
sub Parse_Master {
    open(MAS, '<::encoding(iso-8859-1)',$master_file ) || die "Can not open $master_file";
    open(MASO, '>::encoding(iso-8859-1)',$master_out ) || die "Can not open $master_out";
    open(MASOA, '>::encoding(iso-8859-1)',$master_out_a ) || die "Can not open $master_out_a";

    # The loop condition had lost its readline (it read "while()"), which
    # never reads a line and never terminates; read from the handle opened
    # above.
    while (<MAS>) {
        undef(@master);
        undef($master_order);
        undef($master_family);
        undef($master_subfamily);
        undef($master_genus);
        undef($master_species);

        @master = split(/\,/,$_);
        chomp(@master);
        foreach (@master) {
            s/^\s+//;
            s/\s+$//;
        }
        next if $master[0] eq 'Order';   # header row

        # print MASO "$master[4]\n";
        # print MASO "@master\n";
        # These lexicals deliberately shadow the package globals of the
        # same name for the rest of the iteration.
        my $master_order = $a_order{$master[0]};
        my $master_family = $a_family{$master[1]};
        my $master_subfamily = $a_subfamily{$master[2]};
        my $master_genus = $a_genus{$master[3]};
        my $master_species = $a_species{"$master[4]"};

        #print MASO "$master[4]\n";
        print MASO "$master_order:$master[0] " if ($master_order);
        print MASO "$master_family:$master[1] " if ($master_family);
        print MASO "$master_subfamily:$master[2] " if ($master_subfamily);
        print MASO "$master_genus:$master[3] " if ($master_genus);
        print MASO "$master_species:$master[4]\n" if ($master_species);
        #print MASO "$master[4]\n";
    }

    close(MAS);
    close(MASO);
    close(MASOA);
}

############################################
# Parse Old style fs_flavi.htm type files #
############################################
# Argument: path of one old-layout family page.
# Feeds the file line by line to an HTML::Parser whose handlers (defined
# below as ordinary package subs) keep their state in package globals:
#   $h3_flag  set on an H3 start token, cleared (with $area) on DIV
#   $area     'Family' / 'species' / 'tspecies', chosen from heading text
#   $numeric, $name, $acronym, $assesion_number, $vector, $tmp_name
#             the record currently being assembled
# Records are stored through Add_Order / Add_Family / Add_Genus /
# Add_Species / Add_Strain.  Dies if the file cannot be opened.
sub Parse_Old_Family {
    my $file = $_[0];
    #print "$file\n";
    open(OFAM, '<::encoding(iso-8859-1)',$file ) || die "Can not open $file";

    # Use comments to break the file into pieces
    # (currently a no-op: the comment text is received and ignored)
    sub old_family_comment {
        my $comment = $_[0];
    }

    # Get start tags for finer differentiation
    sub old_family_start {
        my ($event,$tokens,$text,$attr) = @_;
        foreach my $t (@$tokens) {
            if($t eq 'H3') {
                $h3_flag = 'true';
                #print "found $t\n";
            }
            elsif ($t eq 'DIV') {
                undef($h3_flag);
                undef($area);
            }
            elsif($t eq 'FONT') {
                if($$attr{'color'}) {
                    $color = $$attr{'color'};
                }
            }
        }
    }

    # Handle the textual content of the file
    sub old_family_text {
        my ($self, $text) = @_;
        $initial_class = 'NA';
        chomp($text);
        $text = Fix_Text($text);

        # Only non-blank text seen after an H3 is of interest.
        if (($h3_flag) && ($text !~ /^\s*$/)) {
            # $text =~ s/^\s+//; #Strip leading and trailing whitespace
            # $text =~ s/\s+$//;
            #print "text is $text\n";

            # Section headings switch the current area.
            if ($text =~ /Taxonomic Structure of the Family/) {
                #print "in family loop\n";
                $area = 'Family';
            }
            elsif ($text =~ /Species, their serotypes, strains and isolates/) {
                $area = 'species';
            }
            elsif ($text =~ /Tentative Species in the Genus/) {
                $area = 'tspecies';
            }

            if ($area) {
                if ($area eq 'Family') {
                    # NOTE(review): this pattern and the "Misplaced"
                    # message below look as if they once contained an
                    # HTML entity that was lost; kept exactly as found.
                    $text =~ s/ \ \; //xg;
                    if ($text =~ /^\w+\s+\d{2}\./x) {
                        # "<Rank> ... <code>": remember it, the name
                        # arrives in a later text chunk.
                        #print "digital match $text\n";
                        @order = split(/\s/,$text);
                        chomp(@order);
                        #print "In Family(digital): $order[0]:$order[(@order - 1)]";
                    }
                    elsif (@order) {
                        # $order[0] is the rank word, the last element
                        # the numeric code, $text the name.
                        #print "In Family(order): $order[0]:$order[(@order - 1)]:$text\n";
                        if ($order[0] =~ /^ Order $/xi) {
                            Add_Order($order[(@order - 1)],$text);
                        }
                        elsif ($order[0] =~ /^ Family $/xi) {
                            if ($text =~ /^\s*$/) {
                                print "Misplaced   in $shortname:$line_number\n\n";
                                return;
                            }
                            else {
                                Add_Family($order[(@order - 1)],$text);
                                #print "Adding family $order[(@order - 1)]:$text\n";
                            }
                        }
                        else {
                            Add_Genus($order[(@order - 1)],$text);
                        }
                        undef(@order);
                        undef($name);
                    }
                    else {
                        # Rank word, code and name arrive as three
                        # separate text chunks.
                        if(($text =~ /\w+$/x) && (! $name)) {
                            $name = $text;
                        }
                        elsif ($text =~ /^\d+\.\d*/) {
                            $numeric = $text;
                        }
                        elsif (($name) && ($numeric) && ($text =~ /\w+$/x )) {
                            if ($name =~/Genus/i) {
                                Add_Genus($numeric,$text);
                            }
                            elsif ($name =~/Species/i) {
                                Add_Species($numeric,$text);
                            }
                            #print "In Family(else):$name:$numeric:$text\n";
                            undef($name);
                            undef($numeric);
                        }
                    }
                }

                if ($area eq 'species') {
                    return if ( $text =~ /Species,/);
                    if ($text =~ / \d{2}\.\d{3}\.\d\.\d{2}\.\d{3}\./x) {
                        # A new code starts a new record; store the one
                        # collected so far first.
                        if (($numeric) && ($name)) {
                            # Only the "accession and acronym" case ever
                            # printed; the species/strain decision was
                            # the same in every case.
                            if(($assesion_number) && ($acronym)) {
                                print "$numeric:$acronym:$name:$assesion_number\n";
                            }
                            if($numeric =~ /\d{2}\.\d{3}\.\d\.\d{2}\.\d{3}\.$/) {
                                Add_Species($numeric,$name);
                            }
                            elsif ($numeric =~ /\d{2}\.\d{3}\.\d\.\d{2}\.\d{3}\.\d{2}\.\d{3}\.$/) {
                                Add_Strain($numeric,$name);
                            }
                            undef($numeric);
                            undef($name);
                            undef($acronym);
                            undef($assesion_number);
                            $numeric = $text;
                        }
                        else {
                            $numeric = $text;
                        }
                    }
                    elsif($text =~ /^\[/) {
                        $assesion_number = $text;
                        #Add_Assesion($numeric,$assesion_number);
                    }
                    elsif($text =~ /^\(/) {
                        $acronym = $text;
                        $acronym =~ s/\(//;
                        $acronym =~ s/\)//;
                        #Add_Acronym($numeric,$name);
                    }
                    elsif($text =~ /^\{/) {
                        if ($text =~ /^\{\w*:/) {
                            print "found bad use of {} $shortname:$line_number\n";
                            $tmp_name = $text;
                        }
                        else {
                            $vector = $text;
                            $vector =~ s/\{//;
                            $vector =~ s/\}//;
                        }
                    }
                    elsif ($tmp_name) {
                        $name = $tmp_name.$text;
                    }
                    else {
                        $name = $text;
                    }
                    #print "In Species: $text\n";
                }

                if ($area eq 'tspecies') {
                    #print "In Tspecies: $text\n";
                }
            }
        }
    }

    my $p = HTML::Parser->new( api_version => 3,
                               comment_h => [\&old_family_comment, "text"],
                               text_h => [\&old_family_text, "event,text"],
                               start_h => [\&old_family_start, "event,tokens,text,attr"]);

    # The readline was missing from this loop ("while ()"), so no line was
    # ever read; read from the handle opened above.  $line_number is kept
    # for the diagnostics printed by the text handler.
    $line_number = 0;
    while (<OFAM>) {
        $line_number++;
        $p->parse($_);
    }
    # flush and parse remaining unparsed HTML
    $p->eof;
    close(OFAM);
}

#######################
# Parse fr-fst-a.htm. #
#######################
# Argument: path of the fr-fst-a.htm page.
# Between a text chunk starting with "VirusCode" and the chunk
# "Additional References", a chunk starting with two digits and a dot is
# remembered as the current code, and the next chunk that starts with an
# upper-case then lower-case letter is stored as its name.  The number of
# dot-separated fields in the code selects the level:
#   1 order, 2 family, 3 subfamily, 4 genus, 5 species, 6 printed only.
# Sets $initial_class to 'NA' while parsing and undefines it afterwards.
# Dies if the file cannot be opened.
sub Parse_fstA {
    my $fstA_file = $_[0];
    open(FSTA, '<:encoding(iso-8859-1)', $fstA_file) || die "Can not open $fstA_file";
    $initial_class = 'NA';

    # Handle text from HTML
    sub fstA_text {
        my ($event, $text) = @_;
        $text = Fix_Text($text);
        if ($text =~ /^VirusCode/ ) {
            # In area of interest
            #print "Starting\n";
            $VC = 'true';
        }
        elsif ($text eq 'Additional References') {
            # End of area of interest
            undef($VC);
        }
        # Parsing data of interest to us.
        elsif ($VC) {
            if ($text =~ /^[0-9][0-9]\./ ) {
                $numeric = $text;
                @numeric = split(/\./,$text);
                $count = @numeric;
                #print "count is $count\n";
            }
            if (($text =~ /^[A-Z][a-z]/) && ($numeric)) {
                #print "Text is $text\n";
                #$textual = $text;
                # $count has exactly one value, so the former run of
                # independent "if" tests is an elsif chain; any other
                # count leaves $numeric untouched, as before.
                if ($count == 1) {
                    Add_Order($numeric,$text);
                    undef($numeric);
                }
                elsif ($count == 2) {
                    Add_Family($numeric,$text);
                    undef($numeric);
                    #print "Adding family $numeric $text from $shortname:$line_number\n";
                }
                elsif ($count == 3) {
                    Add_Subfamily($numeric,$text);
                    undef($numeric);
                }
                elsif ($count == 4) {
                    Add_Genus($numeric,$text);
                    undef($numeric);
                }
                elsif ($count == 5) {
                    Add_Species($numeric,$text);
                    undef($numeric);
                }
                elsif ($count == 6) {
                    # Six-field codes are only reported, not stored.
                    #Add_Species($numeric,$text);
                    print "$numeric $text\n";
                    undef($numeric);
                }
            }
        }
    }

    my $p = HTML::Parser->new( api_version => 3,
                               text_h => [\&fstA_text, "event,text"]);

    # parse line-by-line, rather than the whole file at once
    # (the readline was missing from this loop; read from the handle
    # opened above)
    $line_number = 0;
    while (<FSTA>) {
        $line_number++;
        $p->parse($_);
    }
    # flush and parse remaining unparsed HTML
    $p->eof;
    close(FSTA);
    undef($initial_class);
}

########################################
# Parse files of the type vn_indxA.htm #
########################################
sub Parse_Index {
    undef($area);
    my $index_file = $_[0];
    # $index_file = '/home/flarson/ICTVdb/Ictv/vn_indxA.htm';
    #print "Reading from $index_file\n";
    open(IDX, '<:encoding(iso-8859-1)',$index_file ) || die "Can not open $index_file";

    # Parse Comments
    sub index_comment {
        my $comment = $_[0];
        if ($comment eq '') {
            #print "$comment\n";
            $area = 'VirusList';
            $pos_ct = 0;
        } elsif ($comment eq '') {
            #print "$comment\n";
            undef($area);
        }
    }

    # Parse start tags for selecting text
    sub index_start {
        my ($event,$tokens,$text,$attr) = @_;
        foreach my $t (@$tokens) {
            if( $t eq 'BR' ) {
                $pos_ct = 0;
                undef($strain);
                #print "Found BR\n";
            }
            if ($t eq 'span') {
                $class = $$attr{'class'};
                #print "$class\n";
            }
            if (($area) && ($pos_ct == 0)
&& ($t eq 'A')) { $numeric = Fix_Text($$attr{'name'}); @numeric = split(/\./,$numeric); $count = @numeric; } } } # Handle each text field appropriately sub index_text { my ($event, $text) = @_; #print "$event\n"; $text = Fix_Text($text); if(($area) && ($class)) { #print "$class: $text: $numeric\n"; # Handle Strain level info if(($class eq'strain') && ($pos_ct == 0) && ($text !~ /^\s$/)) { $initial_class = $class; if($numeric =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}$/ ) { print "Need trailing decimal at $numeric:$text:$shortname:$line_number\n\n"; $numeric = "$numeric"."."; } if($numeric =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}\.$/ ) { print "Class is strain but should be species. $numeric:$text:$shortname:$line_number\n\n"; Add_Species($numeric,$text); } else { #print " In strain section Strain:$numeric:$text\n"; Add_Strain($numeric,$text); } $strain = 'true'; $sp_numeric = "$numeric[0].$numeric[1].$numeric[2].$numeric[3].$numeric[4]."; $genus_numeric = "$numeric[0].$numeric[1].$numeric[2].$numeric[3]."; $sf_numeric = "$numeric[0].$numeric[1].$numeric[2]."; $f_numeric = "$numeric[0].$numeric[1]."; $pos_ct++; } elsif ((($class eq 'sp') || ($class eq 'tsp')) && ($pos_ct == 0)&& ($text !~ /^\s*$/)) { #print "In sp section $numeric:$text\n"; $initial_class = $class; $sp_numeric = $numeric; if($sp_numeric =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}$/ ) { print "Need trailing decimal at $sp_numeric:$text:$shortname:$line_number\n\n"; $sp_numeric = "$sp_numeric"."."; } if($sp_numeric !~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}\.$/ ) { print "Class is sp or tsp but should be strain. 
$sp_numeric:$text:$shortname:$line_number\n\n"; Add_Strain($sp_numeric,$text); } else { #print "In indx loop adding species $sp_numeric,$text\n"; Add_Species($sp_numeric,$text); } #print " Adding to Species:$sp_numeric:$text\n"; $genus_numeric = "$numeric[0].$numeric[1].$numeric[2].$numeric[3]."; #print "genus_numeric is $genus_numeric pos_ct is $pos_ct\n"; $sf_numeric = "$numeric[0].$numeric[1].$numeric[2]."; $f_numeric = "$numeric[0].$numeric[1]."; $pos_ct++; } elsif (($class eq 'syn') && ($pos_ct == 0)) { $initial_class = $class; Add_Synonym($numeric,$text); $sp_numeric = "$numeric[0].$numeric[1].$numeric[2].$numeric[3].$numeric[4]."; $genus_numeric = "$numeric[0].$numeric[1].$numeric[2].$numeric[3]."; $sf_numeric = "$numeric[0].$numeric[1].$numeric[2]."; $f_numeric = "$numeric[0].$numeric[1]."; $pos_ct++; # Handle Acronyms } elsif ($class eq 'acro') { $pos_ct++; if ($text !~ /^\s$/) { if ($initial_class eq 'strain') { Add_Acronym($numeric,$text); } else { Add_Acronym($sp_numeric,$text); } } # Handle species, including containing genus, subfamily and family } elsif (($class eq 'sp') || ($class eq 'tsp') || ($class eq 'syn')) { if ($text =~ /^\s*$/) { $pos_ct++; } elsif ($pos_ct == 2) { if($sp_numeric =~ /^[0-9][0-9]\.[0-9][0-9][0-9]\.[0-9]\.[0-9][0-9]\.[0-9][0-9][0-9]\.[0-9]/) { Add_Strain($sp_numeric,$text); #print "Faulty sp_numeric $sp_numeric\n"; } else { Add_Species($sp_numeric,$text); #print "pos+ct is $pos_ct Species:$sp_numeric:$text\n"; } $pos_ct++; } elsif ($pos_ct == 3) { Add_Genus($genus_numeric,$text); #print " Genus:$genus_numeric:$text\n"; $pos_ct++; } elsif ($pos_ct == 4) { Add_Subfamily($sf_numeric,$text); #print " Subfamily:$sf_numeric:$text\n"; $pos_ct++; } elsif ($pos_ct == 5) { Add_Family($f_numeric,$text); #print "Adding family $f_numeric $text from $shortname:$line_number\n"; } } undef($class); } } my $p = HTML::Parser->new( api_version => 3, comment_h => [\&index_comment, "text"], text_h => [\&index_text, "event,text"], start_h => 
[\&index_start, "event,tokens,text,attr"] ); # parse line-by-line, rather than the whole file at once $line_number = 0; while () { $line_number++; $p->parse($_); } # flush and parse remaining unparsed HTML $p->eof; close(IDX); } ######################################## # Parse files of the type fs_flavi.htm # ######################################## sub Parse_Family { my $file = $_[0]; #print "$file\n"; open(FAM, '<:encoding(iso-8859-1)',$file ) || die "Can not open $file"; # Use comments to break the file into pieces sub family_comment { my $comment = $_[0]; if ($comment eq '') { $area = 'NucAcidType'; } if ($comment eq '') { $area = 'TaxFamStruct'; undef($f_numeric); undef($order_numeric); undef($family_name); undef($class); undef($I_flag); } if ($comment eq '') { $area = 'SubFamilyName'; undef($sf_numeric); } if ($comment eq '') { $area = 'GenusName'; undef($genus_numeric); } if ($comment eq '') { $area = 'SpecDemarcCrit'; } if ($comment eq '') { $area = 'SpeciesList'; undef($numeric); undef(@numeric); #undef($count); undef($sp_numeric); undef(@sp_numeric); undef($sp_count); } if ($comment eq '') { undef($area); } if ($comment eq '') { $area = 'TentSpeciesList'; } if ($comment eq '') { undef($area); } #print "comment is $comment\n" } # Get start tags for finer differentiation sub family_start { my ($event,$tokens,$text,$attr) = @_; foreach my $t (@$tokens) { if($t eq 'i') { $I_flag = 'true'; } } if ($$attr{'class'}) { $class = $$attr{'class'}; #print "Class is $class\n"; } } # Handle the textual content of the file sub family_text { my ($self, $text) = @_; chomp($text); $text = Fix_Text($text); if (($area) && ($text)) { # if ($area eq 'NucAcidType') { # if($text !~ /^\s+?/s) { # print "$text\n"; # } # } # Handle Family level information if (($area eq 'TaxFamStruct') && ($class)) { if ($class eq 'F1') { #print "F1 text is $text\n"; if ($text =~ /^ [0-9][0-9]\.$/x ) { $order_numeric = $text; #print "Order numeric is $text\n"; } if ($text =~ /^ 
[0-9]{2}\.[0-9]{3}\.$/x ) { $f_numeric = $text; } elsif ($I_flag) { #print "Italic text is $text\n"; $family_name = $text; if ($order_numeric) { Add_Order($order_numeric,$family_name); #print "Order numeric $text used and reset\n"; undef($order_numeric); } if($f_numeric) { Add_Family($f_numeric,$family_name); #print "$f_numeric:$family_name\n"; undef($I_flag); undef($f_numeric); } } } } # Handle Subfamily level info elsif ($area eq 'SubFamilyName') { if ($text =~ /[0-9]/) { $text =~ s/^\s//; $sf_numeric = $text; @sf_numeric = split(/./,$text); $sf_count = @sf_numeric; } elsif (($sf_numeric) && ($text =~ /^[A-Z]+/)) { Add_Subfamily($sf_numeric,$text); #print "Subfamily:$sf_numeric:$text\n"; undef($sf_numeric); } } # Handle Genus level info elsif ($area eq 'GenusName') { if ($text =~ /[0-9]/) { $text =~ s/^\s//; $genus_numeric = $text; @genus_numeric = split(/./,$text); $genus_count = @genus_numeric; } elsif (($genus_numeric) && ($text =~ /^[A-Z]+/)) { #Add_Genus($genus_numeric,$text); #print "Genus:$genus_numeric:$text\n"; undef($genus_numeric); } } # Handle Species and strain level info elsif (($area eq 'SpeciesList') && ($class) && ($text !~ /^\s$/)) { if ($class eq 'vc') { @numeric = split(/\./,$text); $count = @numeric; #print "Count is $count\n"; if ($count == 5) { $sp_numeric = $text; #print "sp_numeric is $sp_numeric\n"; } elsif ($count > 5) { $strain_numeric = $text; #print "strain_numeric is $strain_numeric\n"; } undef($class); } elsif ($class eq 'sp') { $initial_class = $class; Add_Species($sp_numeric,$text); #print "Adding species $sp_numeric:$text from $shortname:$line_number\n"; undef($class); } elsif ($class eq 'strain') { $initial_class = $class; $strain_name = $text; Add_Strain($strain_numeric,$strain_name); #print "Adding strain $strain_numeric:$strain_name from $shortname:$line_number\n"; undef($class); } elsif ($class eq 'seq' ) { #print "Count is $count $text from $shortname:$line_number\n"; if($count == 5) { Add_Assesion($sp_numeric,$text); } 
elsif ($count > 5) { Add_Assesion($strain_numeric,$text); } #print "Strain assession no. is $str_assesion_number\n"; undef($class); } elsif ($class eq 'acro') { #print "Count is $count $text from $shortname:$line_number\n"; if($count == 5) { Add_Acronym($sp_numeric,$text); } elsif ($count > 5) { Add_Acronym($strain_numeric,$text); } undef($class); } } } } my $p = HTML::Parser->new( api_version => 3, comment_h => [\&family_comment, "text"], text_h => [\&family_text, "event,text"], start_h => [\&family_start, "event,tokens,text,attr"]); $line_number = 0; while () { $line_number++; $p->parse($_); } # flush and parse remaining unparsed HTML $p->eof; close(FAM); } ################################################################################### # Used to add info to the appropriate hash after checking to see if it disagree's # # with something that has been previously added. # ################################################################################### sub Add_Order { local ($nid,$name) = @_; if(! $nid) { print "In add order nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if($nid =~ /^[0-9]{2}\.[0-9]/) { print "Adding family/subfamily/genus/species/strain $nid:$name to subfamily from $shortname:$line_number\n" } if($order{$nid}) { if($order{$nid} ne $name) { if ($order{$nid} eq 'Unassigned') { $order{$nid} = $name; $order_orig{$nid} = "$shortname:$line_number"; if (! $a_order{$name}) { $a_order{$name} = $nid; } } elsif ($name ne 'Unassigned') { print "Order Mismatch!!!\n$nid:$order{$nid} from $order_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n"; } } } else { $order{$nid} = $name; $order_orig{$nid} = "$shortname:$line_number"; if (! $a_order{$name}) { $a_order{$name} = $nid; } } } sub Add_Family { local ($nid,$name) = @_; if(! 
$nid) { print "In add family nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if($nid =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]/) { print "Adding subfamily/genus/species/strain $nid:$name to family from $shortname:$line_number\n" } if($family{$nid}) { if($family{$nid} ne $name) { if ($family{$nid} eq 'Unassigned') { $family{$nid} = $name; $family_orig{$nid} = "$shortname:$line_number"; if(! $a_family{$name}) { $a_family{$name} = $nid; } } elsif ($name ne 'Unassigned') { print "Family Mismatch!!! Initial class was $initial_class.\n"; print "$nid:$family{$nid} from $family_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } } else { $family{$nid} = $name; $family_orig{$nid} = "$shortname:$line_number"; if(! $a_family{$name}) { $a_family{$name} = $nid; } } } sub Add_Subfamily { local ($nid,$name) = @_; if(! $nid) { print "In add subfamily nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if($nid =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]/) { print "Adding genus/species/strain $nid:$name to subfamily from $shortname:$line_number\n" } if($subfamily{$nid}) { if($subfamily{$nid} ne $name) { if ($subfamily{$nid} eq 'Unassigned') { $subfamily{$nid} = $name; $subfamily_orig{$nid} = "$shortname:$line_number"; if(! $a_subfamily{$name}) { $a_subfamily{$name} = $nid; } } elsif ($name ne 'Unassigned') { print "Subfamily Mismatch!!! Initial class was $initial_class.\n"; print "$nid:$subfamily{$nid} from $subfamily_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } } else { $subfamily{$nid} = $name; $subfamily_orig{$nid} = "$shortname:$line_number"; if(! $a_subfamily{$name}) { $a_subfamily{$name} = $nid; } } } sub Add_Genus { local ($nid,$name) = @_; if(! 
$nid) { print "In add genus nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if($nid =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]/) { print "Adding species or strain $nid:$name to genus from $shortname:$line_number\n" } if($genus{$nid}) { if($genus{$nid} ne $name) { if ($genus{$nid} eq 'Unassigned') { $genus{$nid} = $name; $genus_orig{$nid} = "$shortname:$line_number"; if (! $a_genus{$name}) { $a_genus{$name} = $nid; } } elsif ($name ne 'Unassigned') { print "Genus Mismatch!!! Initial class was $initial_class, Line # $line_number.\n"; print "$nid:$genus{$nid} from $genus_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } } else { $genus{$nid} = $name; $genus_orig{$nid} = "$shortname:$line_number"; if (! $a_genus{$name}) { $a_genus{$name} = $nid; } } } sub Add_Species { local ($nid,$name) = @_; if(! $nid) { print "In add species nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if($nid =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}\.[0-9]/) { print "Adding strain $nid:$name to species from $shortname:$line_number\n" } if($species{$nid}) { if($species{$nid} ne $name) { if ($species{$nid} eq 'Unassigned') { $species{$nid} = $name; $species_orig{$nid} = "$shortname:$line_number"; if (! $a_species{$name}) { $a_species{$name} = $nid; } } elsif ($name ne 'Unassigned') { print "Species Mismatch!!! Initial class was $initial_class.\n"; print "$nid:$species{$nid} from $species_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } } else { $species{$nid} = $name; $species_orig{$nid} = "$shortname:$line_number"; if (! $a_species{$name}) { $a_species{$name} = $nid; } } } sub Add_Strain { local ($nid,$name) = @_; if(! 
$nid) { print "In add strain nid is $nid and name is $name from $shortname:$line_number\n"; } $name =~ s/unassigned|Not assigned|unclassified/Unassigned/; if ($nid =~ /^[0-9]{2}\.[0-9]{3}\.[0-9]\.[0-9]{2}\.[0-9]{3}\.$/) { print "Adding species $nid:$name to strain from $shortname:$line_number\n"; } if($strain{$nid}) { if($strain{$nid} ne $name) { #print "Strain Mismatch!!! Initial class was $initial_class.\n"; #print "$nid:$strain{$nid} from $strain_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } else { $strain{$nid} = $name; $strain_orig{$nid} = "$shortname:$line_number"; if($name ne 'Unassigned') { $a_strain{$name} = $nid; } } } sub Add_Acronym { local ($nid,$name) = @_; if(! $nid) { print "In add acronym nid is $nid and name is $name from $shortname:$line_number\n"; } if($acronym{$nid}) { if($acronym{$nid} ne $name) { #print "Acronym Mismatch!!! Initial class was $initial_class.\n"; #print "$nid:$acronym{$nid} from $acronym_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } else { $acronym{$nid} = $name; $acronym_orig{$nid} = "$shortname:$line_number"; } } sub Add_Assesion { local ($nid,$name) = @_; if(! $nid) { print "In add assesion nid is $nid and name is $name from $shortname:$line_number\n"; } if($assesion{$nid}) { if($assesion{$nid} ne $name) { #print "Assesion Mismatch!!! Initial class was $initial_class.\n"; #print "$nid:$assesion{$nid} from $assesion_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } else { $assesion{$nid} = $name; $assesion_orig{$nid} = "$shortname:$line_number"; } } sub Add_Synonym { local ($nid,$name) = @_; if(! $nid) { print "In add synonym nid is $nid and name is $name from $shortname:$line_number\n"; } if($synonym{$nid}) { if($synonym{$nid} ne $name) { #print "Synonym Mismatch!!! 
Initial class was $initial_class.\n"; #print "$nid:$synonym{$nid} from $synonym_orig{$nid} does not match\n$nid:$name from $shortname:$line_number\n\n"; } } else { $synonym{$nid} = $name; $synonym_orig{$nid} = "$shortname:$line_number"; if($name ne 'Unassigned') { $a_synonym{$name} = $nid; } } } ########################################## # Subroutines to print resulting hashes # ########################################## sub Print_Order { #print "Printing Orders\n"; open(GRP,'>',"$order_out"); my @order = sort(keys(%order)); foreach my $order (@order) { print GRP "$order:$order{$order}\n"; } close(GRP); } sub Print_Order_A { #print "Printing Orders\n"; open(AGRP,'>',"$order_out_a"); my @a_order = sort(keys(%a_order)); foreach my $a_order (@a_order) { print AGRP "$a_order:$a_order{$a_order}\n"; } close(AGRP); } sub Print_Family { #print "Printing Families\n"; open(FAM,'>',"$family_out"); my @family = sort(keys(%family)); foreach my $family (@family) { print FAM "$family:$family{$family}\n"; } close(FAM); } sub Print_Family_A { #print "Printing Families\n"; open(AFAM,'>',"$family_out_a"); my @a_family = sort(keys(%a_family)); foreach my $a_family (@a_family) { print AFAM "$a_family:$a_family{$a_family}\n"; } close(AFAM); } sub Print_Subfamily { #print "Printing Subfamilies\n"; open(SFAM,'>',"$subfamily_out"); my @subfamily = sort(keys(%subfamily)); foreach my $subfamily (@subfamily) { print SFAM "$subfamily:$subfamily{$subfamily}\n"; } close(SFAM); } sub Print_Subfamily_A { #print "Printing Subfamilies\n"; open(ASFAM,'>',"$subfamily_out_a"); my @a_subfamily = sort(keys(%a_subfamily)); foreach my $a_subfamily (@a_subfamily) { print ASFAM "$a_subfamily:$a_subfamily{$a_subfamily}\n"; } close(ASFAM); } sub Print_Genus { #print "Printing Genuses\n"; open(GEN,'>',"$genus_out"); my @genus = sort(keys(%genus)); foreach my $genus (@genus) { print GEN "$genus:$genus{$genus}\n"; } close(GEN); } sub Print_Genus_A { #print "Printing Genuses\n"; open(AGEN,'>',"$genus_out_a"); my 
@a_genus = sort(keys(%a_genus)); foreach my $a_genus (@a_genus) { print AGEN "$a_genus:$a_genus{$a_genus}\n"; } close(AGEN); } sub Print_Species { #print "Printing Species\n"; my @species = sort(keys(%species)); open(SPC,'>',"$species_out"); foreach my $species (@species) { if(($acronym{$species}) && ($assesion{$species})) { print SPC "$species:$acronym{$species}:$species{$species}:$assesion{$species}\n"; } elsif($acronym{$species}) { print SPC "$species:$acronym{$species}:$species{$species}\n"; } elsif ($assesion{$species}) { print SPC "$species:$species{$species}:$assesion{$species}\n"; } else { print SPC "$species:$species{$species}\n"; } } close(SPC); } sub Print_Species_A { #print "Printing Species\n"; my @a_species = sort(keys(%a_species)); open(ASPC,'>',"$species_out_a"); foreach my $a_species (@a_species) { print ASPC "$a_species:$a_species{$a_species}\n"; } close(ASPC); } sub Print_Strain { my @strain = sort(keys(%strain)); open(STR,'>',"$strain_out"); foreach my $strain (@strain) { if (($acronym{$strain}) && ($assesion{$strain})) { print STR "$strain:$acronym{$strain}:$strain{$strain}:$assesion{$strain}\n"; } elsif($acronym{$strain}) { print STR "$strain:$acronym{$strain}:$strain{$strain}\n"; } elsif ($assesion{$strain}) { print STR "$strain:$strain{$strain}:$assesion{$strain}\n"; } else { print STR "$strain:$strain{$strain}\n"; } } close(STR); } sub Print_Strain_A { my @a_strain = sort(keys(%a_strain)); open(ASTR,'>',"$strain_out_a"); foreach my $a_strain (@a_strain) { print ASTR "$a_strain:$a_strain{$a_strain}\n"; } close(ASTR); } sub Print_Acronym { #print "Printing Acronyms\n"; open(ACR,'>',"$acronym_out"); my @acronym = sort(keys(%acronym)); foreach my $acronym (@acronym) { print ACR "$acronym:$acronym{$acronym}\n"; } close(ACR); } sub Print_Synonym { #print "Printing synonyms\n"; open(SYN,'>',"$synonym_out"); my @synonym = sort(keys(%synonym)); foreach my $synonym (@synonym) { print SYN "$synonym:$synonym{$synonym}\n"; } close(SYN); } sub Fix_Text { 
my $text = $_[0]; my $oldtext = $text; $text =~ s/\ \;|\;\ \;//g; $text =~ s/ \ \; //xg; $text =~ s/\;\ \;//g; $text =~ s/^\s+//; #Strip leading and trailing whitespace $text =~ s/\s+$//; $text =~ s/\α\;|\balpha/Alpha/g; $text =~ s/\ba10\b/Alpha10/g; $text =~ s/\ba3/Alpha3/g; $text =~ s/phage Alpha/phage alpha/g; $text =~ s/\β\;|\bbeta/Beta/g; $text =~ s/phage Beta/phage beta/g; $text =~ s/\Χ\;|\bchi/Chi/g; $text =~ s/phage Chi/phage chi/g; $text =~ s/\δ\;|\bdelta/Delta/g; $text =~ s/phage Delta/phage delta/g; $text =~ s/\ϕ\;|\Φ\;|\bphi/Phi/g; $text =~ s/phage Phi/phage phi/g; $text =~ s/\γ\;|\bgamma/Gamma/g; $text =~ s/phage gamma/phage gamma/g; $text =~ s/\λ\;|\blambda/Lambda/g; $text =~ s/phage Lambda/phage lambda/g; $text =~ s/\η\;|\beta/Eta/g; $text =~ s/phage Eta/phage eta/g; $text =~ s/\Ψ\;|\ψ\;|\bpsi/Psi/g; $text =~ s/phage Psi/phage psi/g; $text =~ s/\μ\;|\bmu/Mu/g; $text =~ s/phage Mu/phage mu/g; $text =~ s/phage m2/phage mu2/g; $text =~ s/\τ\;|\btau/Tau/g; $text =~ s/phage Tau/phage tau/g; $text =~ s/\ζ\;|\bzeta/Zeta/g; $text =~ s/phage Zeta/phage zeta/g; $text =~ s/\Ω\;|\ω\;|\bomega/Omega/g; $text =~ s/phage Omega/phage omega/g; $text =~ s/brachovirus/bracovirus/ig; $text =~ s/Allolevirus/Allolevivirus/ig; $text =~ s/Spbeta/SPBeta/g; $text =~ s/\á\;/á/g; $text =~ s/[Ss]abiá|sabia/Sabia/g; $text =~ s/fUW21/phiUW21/g; $text =~ s/fNS11/phiNS11/g; $text =~ s/fMH2K/phiMH2K/g; $text =~ s/fE125/phiE125/g; $text =~ s/fCd1/phiCd1/g; $text =~ s/f3626/phi3626/g; $text =~ s/1f1/1phi1/g; $text =~ s/1f3/1phi3/g; $text =~ s/1f7/1phi7/g; $text =~ s/1f9/1phi9/g; $text =~ s/d1/delta1/g; $text =~ s/df3/dPhi3/g; $text =~ s/df4/dPhi4/g; $text =~ s/df5/dPhi5/g; $text =~ s/f1.2/Phi1.2/g; $text =~ s/fA/phiA/g; $text =~ s/fK/phiK/g; $text =~ s/fR/phiR/g; $text =~ s/fX174/phiX174/g; $text =~ s/f6/phi6/g; $text =~ s/f7/phi7/g; $text =~ s/f8/phi8/g; $text =~ s/f9/phi9/g; $text =~ s/f10/phi10/g; $text =~ s/f11/phi11/g; $text =~ s/f12/phi12/g; $text =~ s/f13/phi13/g; $text =~ 
s/f14/phi14/g; $text =~ s/phage Ia\b/phage IAlpha/g; $text =~ s/philHa\b/PhilHAlpha/g; $text =~ s/phage t\b/ phage Tau/g; $text =~ s/Hyf30/HyPhi30/g; $text =~ s/z3/zeta3/g; $text =~ s/Mamalian/Mammalian/g; $text =~ s/\\226/-/g; $text =~ s/\\xD8/Phi/g; # $text =~ s/phiCd2/PhiCdelta1/g; $text =~ s/Ananindeua virus\s*\w*\s*BeAn/Ananindeua virus - BeAn/g; $text =~ s/Virgin River - SPAr/Virgin River virus - SPAr/g; $text =~ s/papillomavirus(\s.\s|\s|-)(\d+)/papillomavirus - $2/g; $text =~ s/papillomavirus\s.\scand/papillomavirus cand/g; $text =~ s/Caulobacter phage c\w*C/Caulobacter phage cphiC/g; $text =~ s/\[BETA\]/BETA/g; $text =~ s/Cercopithecine herpesvirus/Cercopithecine herpesvirus/g; $text =~ s/phage b$/phage beta/g; $text =~ s/\[Het-s\]/Het-s/g; $text =~ s/like Viruses/like viruses/g; $text =~ s/\[PIN\]/PIN/g; $text =~ s/\[PSI\]/PSI/g; $text =~ s/Salmon river/Salmon River/g; $text =~ s/FBT1/phiBT1/g; $text =~ s/fC31/phiC31/g; $text =~ s/\[URE3\]/URE3/g; $text =~ s/Chysochromulina/Chrysochromulina/g; $text =~ s/and9l/and91/g; $text =~ s/(\w)\s\s(\w)/$1 $2/g; # replace any double space with single space $text =~ s/Paran.\b/Parana/g; $text =~ s/Circopithecine/Cercopithecine/g; $text =~ s/SPO1/SP01/g; $text =~ s/Idnoreovirus -(\d)/Idnoreovirus - $1/g; $text =~ s/Betalipothrixvirus\*/Betalipothrixvirus/g; $text =~ s/bacteriophage phiC31/phage phiC31/g; $text =~ s/Vibrio phage phi149/Vibrio phage 149 \(type IV\)/g; $text =~ s/Pseudomonas phage gh-1/Pseudomonad phage gh-1/g; $text =~ s/Porcine circovirus - (\d)/Porcine circovirus-$1/g; $text =~ s/Hepatitis Delta virus/Hepatitis delta virus/g; $text =~ s/phage F1/phage FI/g; $text =~ s/Sacbrood Virus/Sacbrood virus/g; # $text =~ s/00\.031\.0\.00\./04\.002\.0\.00\./g; # $text =~ s/00\.031\.0\.01\./04\.002\.0\.01\./g; # $text =~ s/00\.031\.1\.01\./04\.001\.1\.01\./g; # $text =~ s/00\.031\.1\.02\.|00\.031\.1\.82\./04\.001\.1\.02\./g; # $text =~ s/00\.031\.1\.03\./04\.001\.1\.03\./g; # $text =~ 
s/00\.031\.1\.04\./04\.001\.1\.04\./g; # $text =~ s/00\.031\.2\.01\.|00\.031\.2\.81\./04\.001\.2\.01\./g; # $text =~ s/00\.031\.2\.02\./04\.001\.2\.02\./g; # $text =~ s/00\.031\.2\.03\./04\.001\.2\.03\./g; # $text =~ s/00\.031\.2\.04\./04\.001\.2\.04\./g; # $text =~ s/00\.031\.3\.01\./04\.001\.3\.01\./g; # $text =~ s/00\.031\.3\.02\.|00\.031\.3\.02\./04\.001\.3\.02\./g; # $text =~ s/00\.052\./05\.001\./g; # $text =~ s/00\.065\./05\.005\./g; # $text =~ s/00\.101\./05\.003\./g; # $text =~ s/00\.105\./05\.002\./g; # $text =~ s/00\.112\.0\.01|00\.112\.0\.81/05\.000\.0\.01/g; # $text =~ s/00\.018\./05\.006\./g; # $text =~ s/00\.084\.0\.01\.|00\.084\.0\.81\./00\.000\.0\.01\./g; # $text =~ s/00\.111\.0\.01/05\.000\.0\.02/g; # $text =~ s/00\.071\.0\.01\.|00\.071\.0\.81\./00\.000\.0\.03\./g; # $text =~ s/00\.072\.0\.01\./00\.000\.0\.04\./g; # $text =~ s/00\.032\.0\.01\./00\.000\.0\.05\./g; # $text =~ s/00\.027\.0\.01\./00\.000\.0\.06\./g; # $text =~ s/00\.086\.0\.01\./00\.000\.0\.07\./g; # $text =~ s/00\.087\.0\.01\./00\.000\.0\.08\./g; # $text =~ s/00\.088\.0\.01\.|00\.088\.0\.81\./00\.000\.0\.09\./g; #$text =~ s///g; # if ($text =~ /Paran.\b/ ) { # print "Found $text to match Paran\n"; # $text =~ s/Paran.\b/Parana/g; # } # if ($text eq ';') { # print "$oldtext became $text in $shortname at $line_number\n"; # } return($text); }