Difference between revisions of "OA and scripts for disease data"
Line 220: | Line 220: | ||
Field 11 Name:(dis_omimdb) Database for Disease Rel <br /> | Field 11 Name:(dis_omimdb) Database for Disease Rel <br /> | ||
Field 7 Name:(dis_species) Species <br /> | Field 7 Name:(dis_species) Species <br /> | ||
+ | |||
+ | ====Code annotation==== | ||
+ | For get_dis_disease_ace.pm | ||
+ | <pre style="white-space: pre-wrap; | ||
+ | white-space: -moz-pre-wrap; | ||
+ | white-space: -pre-wrap; | ||
+ | white-space: -o-pre-wrap; | ||
+ | word-wrap: break-word"> | ||
+ | |||
package get_dis_disease_ace;
require Exporter;

our @ISA     = ('Exporter');
our @EXPORT  = ('getDisease');
our $VERSION = 1.00;

# Dumper module that turns the dis_ disease tables (Ranjana's disease data)
# into .ace text. 2013 01 18

use strict;
use diagnostics;
use LWP;
use LWP::Simple;
use DBI;

# Connection to the postgres database; loading the module dies if it fails.
my $dbh = DBI->connect( "dbi:Pg:dbname=testdb", "", "" )
    or die "Cannot connect to database!\n";

# Statement handle shared by every query getDisease issues.
my $result;

# Postgres data waiting to be written out as .ace:
# $theHash{table}{first column} = second column of each row read.
my %theHash;

# Postgres tables to read; each name gets the dis_ prefix when queried
# (dis_wbgene, dis_humandoid, ...).
my @tables = (
    'wbgene',  'humandoid',        'paperexpmod', 'dbexpmod',
    'species', 'diseaserelevance', 'paperdisrel', 'dbdisrel',
);

my $all_entry = '';    # accumulated .ace output
my $err_text  = '';    # accumulated error text

my %nameToIDs;    # type -> name -> id -> count; maps each WBGene to its pgids
my %ids;          # every pgid relevant to the current query

# How the value of each multi-valued table is stored.
my %dataType = (
    humandoid   => 'multi',
    paperexpmod => 'multi',
    paperdisrel => 'multi',
    dbexpmod    => 'comma',
    dbdisrel    => 'comma',
);

1;
+ | |||
# getDisease( $flag )
#
# Build the .ace dump for the dis_ disease tables.
#
#   $flag  'all' to dump every row of dis_wbgene, otherwise a single
#          dis_wbgene value (a WBGene ID) to restrict the dump to.
#
# Returns the list ( $all_entry, $err_text ): the concatenated .ace text for
# every gene that has at least one tag line, and the error text (nothing in
# this sub ever writes to it, so it comes back empty).
#
# For each pgid of a gene the output is, in order:
#   Experimental_model lines (one per DOID, repeated per paper),
#   Disease_relevance lines  (repeated per paper),
#   Database "OMIM" lines    (one per distinct OMIM number for that pgid).
sub getDisease {
    my ($flag) = @_;
    $flag = '' unless defined $flag;

    # These are file-level variables; without a reset a second call would
    # return the previous call's entries again.
    %theHash   = ();
    %nameToIDs = ();
    %ids       = ();
    $all_entry = '';
    $err_text  = '';

    my $dump_all = ( $flag eq 'all' );

    # Read dis_wbgene: column 0 is used as the pgid (the joinkey of the other
    # tables), column 1 as the gene name.  The gene is passed as a bind value
    # so that a quote in $flag cannot alter the statement.
    if ($dump_all) {
        $result = $dbh->prepare('SELECT * FROM dis_wbgene');
        $result->execute();
    }
    else {
        $result = $dbh->prepare('SELECT * FROM dis_wbgene WHERE dis_wbgene = ?');
        $result->execute($flag);
    }
    while ( my @row = $result->fetchrow_array ) {
        my ( $pgid, $gene ) = @row[ 0, 1 ];
        $theHash{object}{$pgid} = $gene;
        $nameToIDs{object}{$gene}{$pgid}++;    # one gene may map to several pgids
        $ids{$pgid}++;
    }

    my @pgids = sort keys %ids;

    # A specific gene with no rows has nothing to dump (and an empty IN ()
    # list would not be valid SQL).
    return ( $all_entry, $err_text ) if !$dump_all && !@pgids;

    # For a specific gene, restrict every table to that gene's pgids by
    # adding "WHERE joinkey IN (...)" to the query, one placeholder per pgid.
    my $qualifier = '';
    my @bind;
    if ( !$dump_all ) {
        $qualifier = 'WHERE joinkey IN (' . join( ', ', ('?') x @pgids ) . ')';
        @bind      = @pgids;
    }

    # $theHash{table}{pgid} = value, e.g. $theHash{humandoid}{1} = 'DO:1234'.
    foreach my $table (@tables) {
        $result = $dbh->prepare("SELECT * FROM dis_$table $qualifier");
        $result->execute(@bind);
        while ( my @row = $result->fetchrow_array ) {
            $theHash{$table}{ $row[0] } = $row[1];
        }
    }

    foreach my $objName ( sort keys %{ $nameToIDs{object} } ) {
        my $tag_lines = '';    # tag lines of this gene, without the header

        foreach my $pgid ( sort { $a <=> $b } keys %{ $nameToIDs{object}{$objName} } ) {
            my $species = $theHash{species}{$pgid} || '';
            my %omim;          # OMIM numbers of this pgid, de-duplicated

            if ( $theHash{humandoid}{$pgid} ) {
                my @doids  = $theHash{humandoid}{$pgid} =~ m/(DOID:\d+)/g;
                my @papers = ( $theHash{paperexpmod}{$pgid} || '' ) =~ m/(WBPaper\d+)/g;
                foreach my $doid (@doids) {
                    $tag_lines .= _evidence_lines( 'Experimental_model', $doid, $species, @papers );
                }

                # Only the number is captured, not the "OMIM:" prefix.
                if ( $theHash{dbexpmod}{$pgid} ) {
                    $omim{$_}++ for $theHash{dbexpmod}{$pgid} =~ m/OMIM:(\d+)/g;
                }
            }

            if ( $theHash{diseaserelevance}{$pgid} ) {
                my $disrel = $theHash{diseaserelevance}{$pgid};

                # NOTE(review): this doubles every single quote and leaves
                # double quotes untouched, although the text is written
                # inside double quotes below -- confirm this is the escaping
                # acedb expects.
                $disrel =~ s/\'/''/g;
                $disrel =~ s/\n/ /g;    # line breaks become spaces

                my @papers = ( $theHash{paperdisrel}{$pgid} || '' ) =~ m/(WBPaper\d+)/g;
                $tag_lines .= _evidence_lines( 'Disease_relevance', $disrel, $species, @papers );

                if ( $theHash{dbdisrel}{$pgid} ) {
                    $omim{$_}++ for $theHash{dbdisrel}{$pgid} =~ m/OMIM:(\d+)/g;
                }
            }

            foreach my $omim ( sort keys %omim ) {
                $tag_lines .= qq(Database\t"OMIM"\t"disease"\t"$omim"\n);
            }
        }

        # Write the object header only when the gene has tag lines; testing
        # a string that already held the header was always true and dumped
        # empty Gene objects.
        next if $tag_lines eq '';
        $all_entry .= "\nGene : \"$objName\"\n" . $tag_lines;
    }

    return ( $all_entry, $err_text );
}    # sub getDisease

# _evidence_lines( $tag, $value, $species, @papers )
#
# Returns the .ace text for one tag: a single line without evidence when
# @papers is empty, otherwise one Paper_evidence line per paper.
sub _evidence_lines {
    my ( $tag, $value, $species, @papers ) = @_;
    my $stem = qq($tag\t"$value"\t"$species");
    return qq($stem\n) unless @papers;
    return join '', map { qq($stem\tPaper_evidence\t"$_"\n) } @papers;
}    # sub _evidence_lines
+ | |||
+ | __END__ | ||
+ | |||
# getData( $cur_entry, $table, $joinkey, $tag, $objName, $goodGenes_ref )
#
# Looks up $theHash{$table}{$joinkey}, cleans the value and splits it into a
# list, escaping double quotes in each element.  The list is then discarded:
# $cur_entry is returned exactly as it was passed in, and $tag, $objName and
# $goodGenes_ref are never read.
#
# This sub sits after __END__, so perl does not compile it.  %pipeSplit is
# not declared in the visible part of the file.
sub getData {
    my ($cur_entry, $table, $joinkey, $tag, $objName, $goodGenes_ref) = @_;

    my $stored = $theHash{$table}{$joinkey};
    return $cur_entry unless $stored;

    my $data = $stored;
    $data =~ s/^\"//;       # drop one leading double quote
    $data =~ s/\"$//;       # drop one trailing double quote

    # NOTE(review): in the listing this pattern was a raw line break inside
    # the regex; it may have been a carriage return lost in the paste --
    # confirm against the real module.
    $data =~ s/\n//g;
    $data =~ s/\n/ /g;
    $data =~ s/^\s+//g;
    $data =~ s/\s+$//g;

    # Quoted comma-separated list first, then pipe-separated for the tables
    # flagged in %pipeSplit, otherwise a single value.
    my @data = $data =~ m/\",\"/  ? split( /\",\"/, $data )
             : $pipeSplit{$table} ? split( / \| /, $data )
             :                      ($data);

    s/\"/\\\"/g for @data;    # escape double quotes inside each value

    return $cur_entry;
}    # sub getData
Revision as of 22:26, 24 January 2013
Contents
OA for disease data in WormBase
Fields
One gene can be attached to more than one Experimental_Model and one Disease_Relevance (and their related papers, databases and species); they will be grouped together in one instance of the Editor and grouped together in one line in the data-table. This is similar to a gene being attached to more than one GO term. If a gene needs to be attached to an unrelated disease, enter all data on a new line by hitting 'New' in the OA.
Editor:
Field 1 Name: (dis_wbgene) WBGene
Behavior of field: Autocomplete obo
Source: WBGene obo
Similar to: WBGene in the GO OA or concise descrips OA
As one starts typing a locus name (e.g., lin-10) or a cosmid name (e.g., C09H6), the script autocompletes and fills in the WBGene ID.
Q: So single value, not multiple?
A: Single value.
Field 2 Name: (dis_curator) Curator
Behavior of field: Auto-complete drop-down with ready values
Similar to: Curator field in GO OA
Field 3 Name: (dis_curhistory) Curator History
Behavior of field: However it is in the concise OA; this is not something that can be changed manually.
Similar to: concise OA
Field 4 Name: (dis_humandoid) Experimental model for
Behavior:Autocomplete obo
Obo file to be used: DO_term obo
Source: https://diseaseontology.svn.sourceforge.net/svnroot/diseaseontology/trunk/HumanDO.obo
Similar to: GO term field in the GO OA.
For example, the curator starts typing 'Alz', picks 'Alzheimer's disease' from the drop-down, and the script populates the field with 'Alzheimer's disease (DOID:10652)'; this is similar to the GO term field in the GO OA.
Q:Updating: How do we update this obo file, how frequently do other obo files get updated?
A: Everyday at 8pm, if it has the proper .obo format it should be easy to
add to the cronjob that picks them up.
/home/postgres/work/pgpopulation/obo_oa_ontologies/update_obo_oa_ontologies.pl
Q: Single value / multivalue ?
A: Multiple value, as I may need to attach more than one DO term to a gene.
Field 5 Name: (dis_paperexpmod) Paper for Exp Mod
Obo file to be used: Paper obo
Behavior:Autocomplete obo
Obo file to be used: WBPaper obo
Similar to: The Paper field in the GO OA
Q: You mean the papers in the paper editor ?
The Paper obo, I guess they all come from the Paper Editor.
Q:Single/multi ?
Multi value.
Field 6 Name: (dis_dbexpmod) Database for Exp Mod
Behavior: Free text, multiple values comma-separated
Q: Will they dump in separate lines in the output ? Usually those are pipe-separated.
If they'll dump literally as pasted in, then commas are good.
A: Per latest conversation, using commas is fine, as long as there never will be a comma in the data itself, which is not likely to happen as these are OMIM IDs
Q:Do you want to use the same list as everyone else, and add new values to it (if they're okay with those values) ?
A: Yes, I just spoke to Daniela and adding the value 'Homo sapiens' is fine with her, if you want I can e-mail the group, but she felt adding needed values was fine.
Field 7 Name: (dis_species) Species
Behavior: Auto-complete drop-down with ready values
Similar to: Project field in the GO OA
Current values: Homo sapiens
Field 8 Name: (dis_lastupdateexpmod) Last Updated for Exp Model
Script autopopulates date when data is a New line, i.e when the "New" button is used.
Field 9 Name: (dis_diseaserelevance) Disease relevance
Behavior: Big Text box (big text-box, keeps expanding)
Similar To: 'Description Text' field in the Concise OA.
This is the Human_disease_relevance description (it appears as one of the drop-down values) for the 'Description Type' field in the 'Concise' OA.
Change needed: Human_disease_relevance will not be entered via the concise OA. We can remove the 'Human_disease_relevance' from the 'Description Type' field in the OA.
Q:Do we start this OA by populating it from existing data in the GO OA ?
A: You mean 'existing data in the concise description OA'? We can, if that's the way you want to start, or we can do it later.
Q: If so, let me know how to transfer the data.
A: So for any given 'Human disease relevance' description in the concise OA the transfer from Concise OA to Gene-disease OA is as follows:
WBGene-->WBGene
Curator-->Curator
Curator History-->Curator History
Description Text (Human Disease Relevance)-->Disease Relevance
Reference-->Reference under Disease Relevance
Accession Evidence-->OMIM Database
Last Updated-->Last updated
PGID-->PGID
Field 10 Name: (dis_paperdisrel) Paper for Disease Rel
Behavior: Autocomplete obo
Obo file to be used: WBPaper obo
Similar to: The Paper field in the GO OA
Q:So there's two papers fields. Are they both required, or it must have at least one, or nothing is required ?
A: Both are required.
Q:single/multi value ?
A: Multivalue
Field 11 Name: (dis_dbdisrel) Database for Disease Rel
Behavior: Free text, multiple values comma-separated
Q:Same as xref Database, but a different field ?
A: Exactly, again I will pipe-separate multiple values.
Field 12 Name: (dis_lastupdatedisrel) Last Updated for Disease Rel
Behavior: Script fills in current date if new annotation, if manually changing, entered as YYYY-MM-DD
Script autopopulates date when it's a new data line.
Field 13 Name: dis_comment Comment
Behavior: Free text
Field 14 Name: pgid
Data constraints
For curators only at the tool level to check if required fields are filled.
These dis_ tables : wbgene curator humandoid paperexpmod species diseaserelevance paperdisrel lastupdatedisrel
WBGene
Curator
Experimental model for
Paper for Exp Mod
Species
Disease relevance
Paper for Disease Rel
Last Updated
To make live:
at : /home/postgres/work/pgpopulation/dis_disease/
create_dis_tables.pl -- create new postgres tables for dis_ disease OA
synchronize OA
transfer_concise_disease.pl -- take 95 entries that have con_desctype = 'Human_disease_relevance' and add them to dis_ tables starting with pgid 1.
Ranjana, manually delete the Human_disease_relevance entries from the concise OA.
remove the Human_disease_relevance option from the OA, resynchronize.
Dumper specifications
Dumper module in sandbox at /home/postgres/work/citace_upload/dis_disease/get_dis_disease_ace.pm Copy /home/postgres/work/citace_upload/dis_disease/use_package.pl to a directory you own and run it there.
Mapping between OA fields and acedb tags
Model:
?Gene DB_info Database ?Database ?Database_field Text Disease_info Experimental_model ?DO_term XREF Gene_by_biology ?Species #Evidence Potential_model ?DO_term XREF Gene_by_orthology ?Species #Evidence Disease_relevance ?Text ?Species #Evidence
We do not fill in Potential_model tag, Sanger does.
The example is lov-1 in the disease OA in the sandbox:
Model tag: ?Gene
Use value: WBGene (take ID only)
Eg: WBGene00003058
Model tag: DB_info Database ?Database ?Database_field Text
Use value(s) in 'xref Database' and in 'OMIM database'
Eg: OMIM:173900 and OMIM:601313; do not take OMIM:173900 again from 'OMIM database' since it is a duplicate of that in 'xref Database'.
.ace: Database "OMIM" "disease" "173900" Repeat line for each value if there are multiple values
Model tag: Experimental_model ?DO_term XREF Gene_by_biology ?Species #Evidence
Use value in 'Experimental Model for'
Eg:autosomal dominant polycystic kidney (DOID:5937); take ID only
Use value in 'Species' for ?Species
Eg: Homo sapiens
Use value(s) in 'Paper for Exp Mod' for #Evidence
Eg.WBPaper00038373
Repeat .ace line for every paper if multiple papers are present.
.ace:
Experimental_model DOID:5937 "Homo sapiens" Paper_evidence "WBPaper00038373"
Model tag: Disease_relevance ?Text ?Species #Evidence
Use value in 'Disease Relevance' for ?Text
Eg:lov-1 and pkd-2 encode the orthologs of human Polycystin-1 and Polycystin-2, which are mutated in autosomal dominant polycystic kidney disease; the polycystins regulate signaling involved in normal renal tubular structure and function; studies in the worm C. elegans have contributed extensively to the finding that cystic kidney diseases can be considered ciliopathies; in elegans lov-1 and pkd-2 are expressed in male ciliary neurons, are required for normal male mating behavior, do not seem to be required for ciliogenesis, and each polycystin may actually have a potential inhibitory function on the other for ciliary function; lov-1 and pkd-1 interact with a single-pass transmembrane protein, CWP-5, though the significance of this interaction for polycystic kidney disease is unknown.
Use value in 'Species' for ?Species
Eg. Homo sapiens
Use value in 'Paper for Disease Rel' for #Evidence
Eg: WBPaper00038373
.ace: Disease_relevance "lov-1 and pkd-2 encode the orthologs of human Polycystin-1 and Polycystin-2, which are mutated in autosomal dominant polycystic kidney disease; the polycystins regulate signaling involved in normal renal tubular structure and function; studies in the worm C. elegans have contributed extensively to the finding that cystic kidney diseases can be considered ciliopathies; in elegans lov-1 and pkd-2 are expressed in male ciliary neurons, are required for normal male mating behavior, do not seem to be required for ciliogenesis, and each polycystin may actually have a potential inhibitory function on the other for ciliary function; lov-1 and pkd-1 interact with a single-pass transmembrane protein, CWP-5, though the significance of this interaction for polycystic kidney disease is unknown." "Homo sapiens" Paper_evidence "WBPaper00038373" (Repeat this line for every paper, if multiple papers are present).
So put together, .ace file for lov-1 looks like:
Gene : "WBGene00003058" Database "OMIM" "disease" "173900" Database "OMIM" "disease" "601313" Experimental_model DOID:5937 "Homo sapiens" Paper_evidence "WBPaper00038373" Disease_relevance "lov-1 and pkd-2 encode the orthologs of human Polycystin-1 and Polycystin-2, which are mutated in autosomal dominant polycystic kidney disease; the polycystins regulate signaling involved in normal renal tubular structure and function; studies in the worm C. elegans have contributed extensively to the finding that cystic kidney diseases can be considered ciliopathies; in elegans lov-1 and pkd-2 are expressed in male ciliary neurons, are required for normal male mating behavior, do not seem to be required for ciliogenesis, and each polycystin may actually have a potential inhibitory function on the other for ciliary function; lov-1 and pkd-1 interact with a single-pass transmembrane protein, CWP-5, though the significance of this interaction for polycystic kidney disease is unknown." "Homo sapiens" Paper_evidence "WBPaper00038373"
When to dump data
If data is present in Field 4 (dis_humandoid) Experimental model for, dump this field and the related fields:
Field 5 Name:(dis_paperexpmod) Paper for Exp Mod
Field 6 Name:(dis_dbexpmod) Database for Exp Mod
Field 7 Name:(dis_species) Species
If data is present in Field 9 Name:(dis_diseaserelevance) Disease relevance, dump this and the related fields:
Field 10 Name:(dis_paperdisrel) Paper for Disease Rel
Field 11 Name:(dis_dbdisrel) Database for Disease Rel
Field 7 Name:(dis_species) Species
Code annotation
For get_dis_disease_ace.pm
package get_dis_disease_ace; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw( getDisease ); our $VERSION = 1.00; # Dumper module to dump Ranjana's dis_ disease data. 2013 01 18 use strict; use diagnostics; use LWP; use LWP::Simple; use DBI; my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n";# connecting to postres dtabase my $result; my %theHash;# read all the tables from line 25 and store them in a hash; will store all postgres data to parse into .ace output my @tables = qw( wbgene humandoid paperexpmod dbexpmod species diseaserelevance paperdisrel dbdisrel ); #list of postgrestables, as dis_wbgene, dis_humandoid, all begin wiht the prefix dis my $all_entry = ; #defining all the variables, .ace and the error text my $err_text = ; my %nameToIDs; # type -> name -> ids -> count; maps WBGenes to PGids my %ids; #just all the PGIDs that are relevant my %dataType; $dataType{humandoid} = 'multi'; $dataType{paperexpmod} = 'multi'; $dataType{paperdisrel} = 'multi'; $dataType{dbexpmod} = 'comma'; $dataType{dbdisrel} = 'comma'; 1; sub getDisease { my ($flag) = shift; #use all or specify the geneID if ( $flag eq 'all' ) { $result = $dbh->prepare( "SELECT * FROM dis_wbgene; " ); } # get all entries for type; # get all entries for all WBGenes else { $result = $dbh->prepare( "SELECT * FROM dis_wbgene WHERE dis_wbgene = '$flag';" ); } # get all entries for type of object intid; #get all entries for WBGenes with the object name being the same as flag $result->execute(); while (my @row = $result->fetchrow) { $theHash{object}{$row[0]} = $row[1]; $nameToIDs{object}{$row[1]}{$row[0]}++; $ids{$row[0]}++; } # maps from WBGEne ids to PGids, so if a wbgene maps to multiple pgids the mappings are in %nameToIDs;tracks all the pgids for this query my $ids = ; my $qualifier = ; if ($flag ne 'all') { $ids = join"','", sort keys %ids; $qualifier = "WHERE joinkey IN ('$ids')"; } # for all of the tables lsited before, we will restrict it to 
those PGids, so if we're getting a specific query, we only want the data for that specific set of pgids [1/23/13 3:35:35 PM] j chan: and we do that by adding to the postgres query the qualifier WHERE joinkey IN ('$ids') #query for all tables foreach my $table (@tables) { #for each of those tables we will do this query, $theHash{$table}{$row[0]} = $row[1]; $result = $dbh->prepare( "SELECT * FROM dis_$table $qualifier;" ); # get data for table with qualifier (or not if not) $result->execute(); #query results stored in this hash, %theHash, the hash maps to DOID, $theHash{humandoid}{1} = 'DO:1234' while (my @row = $result->fetchrow) { $theHash{$table}{$row[0]} = $row[1]; } } # foreach my $table (@tables) foreach my $objName (sort keys %{ $nameToIDs{object} }) {# getting each of the objects from the nameTOID hash my $entry = ; my $has_data; #storing the .ace entry for .ace object $entry .= "\nGene : \"$objName\"\n"; #will dump empty gene objects, if no data present foreach my $pgid (sort {$a<=>$b} keys %{ $nameToIDs{object}{$objName} }) { #for each PGID that has that object name the data will be dumped my $species = ; if ($theHash{species}{$pgid}) { $species = $theHash{species}{$pgid}; } #will get species value my %omim = (); # filter OMIM results so no duplicates if ($theHash{humandoid}{$pgid}) { #if human DOID my (@doids) = $theHash{humandoid}{$pgid} =~ m/(DOID:\d+)/g;#match for DOID: numbers, DOID:\d+ my @papers; if ($theHash{paperexpmod}{$pgid}) { (@papers) = $theHash{paperexpmod}{$pgid} =~ m/(WBPaper\d+)/g; } #match for WBPaper, WBPaper\d+ foreach my $doid (@doids) { # for each DOID if (scalar @papers > 0) { foreach my $paper (@papers) { $entry .= qq(Experimental_model\t"$doid"\t"$species"\tPaper_evidence\t"$paper"\n); } } #there are papers,Experimental_model\t"$doid"\t"$species"\tPaper_evidence\t"$paper" else { $entry .= qq(Experimental_model\t"$doid"\t"$species"\n); } } #there are no papers, Experimental_model\t"$doid"\t"$species" if ($theHash{dbexpmod}{$pgid}) { 
my (@om) = $theHash{dbexpmod}{$pgid} =~ m/OMIM:(\d+)/g; foreach (@om) { $omim{$_}++; } } #if there is data in dis_dbexpmod, we are going to match for OMIM:(\d+),but only capture the number, not the OMIM:, store in the OMIM hash } if ($theHash{diseaserelevance}{$pgid}) { # if there is disease relevance, dis_diseaserelevance, convert '-->" for acedb, my $disrel = $theHash{diseaserelevance}{$pgid}; if ($disrel =~ m/\'/) { $disrel =~ s/\'//g; } if ($disrel =~ m/\n/) { $disrel =~ s/\n/ /g; } #converts line breaks into spaces my @papers; if ($theHash{paperdisrel}{$pgid}) { (@papers) = $theHash{paperdisrel}{$pgid} =~ m/(WBPaper\d+)/g; } #same as line 73, matching for papers, for the table dis_paperdisrel if (scalar @papers > 0) { foreach my $paper (@papers) { $entry .= qq(Disease_relevance\t"$disrel"\t"$species"\tPaper_evidence\t"$paper"\n); } } #same as 75 and 76, for disease relevance as opposed to DOID else { $entry .= qq(Disease_relevance\t"$disrel"\t"$species"\n); } if ($theHash{dbdisrel}{$pgid}) { my (@om) = $theHash{dbdisrel}{$pgid} =~ m/OMIM:(\d+)/g; foreach (@om) { $omim{$_}++; } } # for disease relevance as opposed to dbexpmod } foreach my $omim (sort keys %omim) { $entry .= qq(Database\t"OMIM"\t"disease"\t"$omim"\n); } #print all the unique OMIM IDs if ($entry) { $has_data++; } # if .ace object has a phenotype, append to whole list } # foreach my $pgid (sort {$a<=>$b} keys %{ $nameToIDs{$type}{$objName} }) if ($has_data) { $all_entry .= $entry; } } # foreach my $objName (sort keys %{ $nameToIDs{$type} }) return( $all_entry, $err_text );# returns all entries, no error checking in place for now; } # sub getDisease __END__ sub getData { my ($cur_entry, $table, $joinkey, $tag, $objName, $goodGenes_ref) = @_; if ($theHash{$table}{$joinkey}) { my $data = $theHash{$table}{$joinkey}; if ($data =~ m/^\"/) { $data =~ s/^\"//; } if ($data =~ m/\"$/) { $data =~ s/\"$//; } if ($data =~ m/ /) { $data =~ s/ //g; } if ($data =~ m/\n/) { $data =~ s/\n/ /g; } if ($data =~ 
m/^\s+/) { $data =~ s/^\s+//g; } if ($data =~ m/\s+$/) { $data =~ s/\s+$//g; } my @data; if ($data =~ m/\",\"/) { @data = split/\",\"/, $data; } elsif ($pipeSplit{$table}) { @data = split/ \| /, $data; } else { push @data, $data; } foreach my $value (@data) { if ($value =~ m/\"/) { $value =~ s/\"/\\\"/g; } } # foreach my $value (@data) } return $cur_entry; } # sub getData