Annotation of Disease scripts

From WormBaseWiki
Jump to navigation | Jump to search

For get_dis_disease_ace.pm


package get_dis_disease_ace;
require Exporter;

our @ISA	= qw(Exporter);
our @EXPORT	= qw( getDisease );
our $VERSION	= 1.00;

# Dumper module to dump Ranjana's dis_ disease data.  2013 01 18
#
# Exports getDisease(), which reads the dis_* postgres tables and returns
# the .ace text plus any error text.

use strict;
use diagnostics;
use LWP;
use LWP::Simple;
use DBI;

# Connect to the postgres database once, when the module is loaded.
# The DBI error string is included so a failed connection says why it failed.
my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database: $DBI::errstr\n";

my $result;                     # statement handle shared by all the queries in this module

# All postgres data to parse into .ace output, stored as
# $theHash{<table>}{<pgid>} = <value>, e.g. $theHash{humandoid}{1} = 'DOID:1234'.
my %theHash;
# Postgres tables to read; each name gets the prefix dis_ (dis_wbgene, dis_humandoid, ...).
my @tables = qw( wbgene humandoid paperexpmod dbexpmod species diseaserelevance paperdisrel dbdisrel );


my $all_entry = '';             # accumulated .ace output
my $err_text = '';              # accumulated error messages

my %nameToIDs;							# type -> name -> ids -> count; maps WBGenes to PGids
my %ids;                                                        # all the PGids that are relevant to the current request

my %deadObjects;                # dead genes and invalid papers, filled by populateDeadObjects

# Storage format of the multi-value tables.
# NOTE(review): %dataType is not read anywhere in the visible code.
my %dataType;
$dataType{humandoid}   = 'multi';
$dataType{paperexpmod} = 'multi';
$dataType{paperdisrel} = 'multi';
$dataType{dbexpmod}    = 'comma';
$dataType{dbdisrel}    = 'comma';

1;

# Fill %deadObjects with the papers and genes that must not be dumped.
#   $deadObjects{paper}{invalid}{"WBPaper<joinkey>"} = second column of pap_status
#   $deadObjects{gene}{"WBGene<joinkey>"}           = second column of gin_dead
# Takes no arguments; the return value is not meaningful.
sub populateDeadObjects {
  # papers whose status is 'invalid'
  my $paper_sql = "SELECT * FROM pap_status WHERE pap_status = 'invalid';";
  $result = $dbh->prepare( $paper_sql );
  $result->execute();
  while (my @paper_row = $result->fetchrow) {
    my $paper_name = "WBPaper$paper_row[0]";
    $deadObjects{paper}{invalid}{$paper_name} = $paper_row[1];
  }

  # dead genes; Ranjana doesn't care about hierarchy, just show her an error message
  my $gene_sql = "SELECT * FROM gin_dead;";
  $result = $dbh->prepare( $gene_sql );
  $result->execute();
  while (my @gene_row = $result->fetchrow) {
    next unless ($gene_row[1]);               # skip rows without a value in the second column
    my $gene_name = "WBGene$gene_row[0]";
    $deadObjects{gene}{$gene_name} = $gene_row[1];
  }
} # sub populateDeadObjects


# Build the .ace dump of disease data for one gene or for all genes.
#
# Parameter:
#   $flag - 'all' to dump every entry in dis_wbgene, otherwise the value of
#           dis_wbgene (the WBGene name) whose entries should be dumped.
# Returns:
#   ( $all_entry, $err_text ) - the .ace text, and one error line for every
#   dead gene or invalid paper that was skipped.
#
# Each call starts from empty state, so calling it more than once does not
# return the data of earlier calls again.
sub getDisease {
  my ($flag) = shift;                   # use 'all' or specify the gene

  # reset everything accumulated by a previous call
  %theHash = (); %nameToIDs = (); %ids = ();
  $all_entry = ''; $err_text = '';

  # the dead gene / invalid paper checks below need %deadObjects; load it once
  populateDeadObjects() unless (%deadObjects);

  # $flag is passed as a bind value, never interpolated into the SQL
  if ( $flag eq 'all' ) {
      $result = $dbh->prepare( "SELECT * FROM dis_wbgene;" );				# get all entries for all WBGenes
      $result->execute(); }
    else {
      $result = $dbh->prepare( "SELECT * FROM dis_wbgene WHERE dis_wbgene = ?;" );	# get all entries for the requested WBGene
      $result->execute($flag); }

  while (my @row = $result->fetchrow_array) {
    if ($deadObjects{gene}{$row[1]}) { $err_text .= "pgid $row[0] has $row[1] which is $deadObjects{gene}{$row[1]}\n"; }        # add dead wbgenes to error out
      else { $theHash{object}{$row[0]} = $row[1]; $nameToIDs{object}{$row[1]}{$row[0]}++; $ids{$row[0]}++; } }          # add non-dead genes to hashes

  # For a specific query, restrict every table to the pgids found above by
  # adding the qualifier WHERE joinkey IN (...) with one placeholder per pgid.
  my $qualifier = ''; my @pgids;
  if ($flag ne 'all') {
    @pgids = sort keys %ids;
    unless (@pgids) { return( $all_entry, $err_text ); }       # nothing to dump, and IN () is not valid SQL
    $qualifier = 'WHERE joinkey IN (' . join(',', ('?') x scalar @pgids) . ')'; }

  foreach my $table (@tables) {
    $result = $dbh->prepare( "SELECT * FROM dis_$table $qualifier;" );		# get data for table with qualifier (or not if not)
    $result->execute(@pgids);
    while (my @row = $result->fetchrow_array) { $theHash{$table}{$row[0]} = $row[1]; }      # e.g. $theHash{humandoid}{1} = 'DOID:1234'
  } # foreach my $table (@tables)

  foreach my $objName (sort keys %{ $nameToIDs{object} }) {
    my $entry = "\nGene : \"$objName\"\n";              # the header is always written, so genes without data are dumped as empty objects

    foreach my $pgid (sort {$a<=>$b} keys %{ $nameToIDs{object}{$objName} }) {
      my $species = ''; if ($theHash{species}{$pgid}) { $species = $theHash{species}{$pgid}; }
      my %omim = ();                                    # filter OMIM results so no duplicates

      if ($theHash{humandoid}{$pgid}) {
        my (@doids) = $theHash{humandoid}{$pgid} =~ m/(DOID:\d+)/g;
        my @papers = _validPapers('paperexpmod', $pgid);        # invalid papers go to the error text, not to the dump
        foreach my $doid (@doids) {
          if (scalar @papers > 0) { foreach my $paper (@papers) { $entry .= qq(Experimental_model\t"$doid"\t"$species"\tPaper_evidence\t"$paper"\n); } }
            else { $entry .= qq(Experimental_model\t"$doid"\t"$species"\n); } }
        # capture only the number of each OMIM:<number>
        if ($theHash{dbexpmod}{$pgid}) { my (@om) = $theHash{dbexpmod}{$pgid} =~ m/OMIM:(\d+)/g; foreach (@om) { $omim{$_}++; } }
      }

      if ($theHash{diseaserelevance}{$pgid}) {
        my $disrel = $theHash{diseaserelevance}{$pgid};
        $disrel =~ s/\'/''/g;                           # double each single quote
        $disrel =~ s/\n/ /g;                            # convert line breaks into spaces
        my @papers = _validPapers('paperdisrel', $pgid);        # invalid papers go to the error text, not to the dump
        if (scalar @papers > 0) { foreach my $paper (@papers) { $entry .= qq(Disease_relevance\t"$disrel"\t"$species"\tPaper_evidence\t"$paper"\n); } }
          else { $entry .= qq(Disease_relevance\t"$disrel"\t"$species"\n); }
        if ($theHash{dbdisrel}{$pgid}) { my (@om) = $theHash{dbdisrel}{$pgid} =~ m/OMIM:(\d+)/g; foreach (@om) { $omim{$_}++; } }
      }

      foreach my $omim (sort keys %omim) { $entry .= qq(Database\t"OMIM"\t"disease"\t"$omim"\n); }      # print all the unique OMIM IDs
    } # foreach my $pgid (sort {$a<=>$b} keys %{ $nameToIDs{object}{$objName} })
    $all_entry .= $entry;
  } # foreach my $objName (sort keys %{ $nameToIDs{object} })
  return( $all_entry, $err_text );
} # sub getDisease

# Return the WBPaper IDs stored in $theHash{$table}{$pgid} that are not
# flagged invalid in %deadObjects.  Every invalid paper adds one line to
# $err_text instead.  Returns an empty list when the table has no data for $pgid.
sub _validPapers {
  my ($table, $pgid) = @_;
  my @valid;
  unless ($theHash{$table}{$pgid}) { return @valid; }
  my (@all_papers) = $theHash{$table}{$pgid} =~ m/(WBPaper\d+)/g;
  foreach my $paper (@all_papers) {
    if ($deadObjects{paper}{invalid}{$paper}) { $err_text .= "pgid $pgid has invalid paper $paper\n"; }
      else { push @valid, $paper; } }
  return @valid;
} # sub _validPapers

__END__

# NOTE: this sub sits after __END__, so perl never compiles or runs it.
# It reads %pipeSplit, which is not declared anywhere in the visible code.
#
# Looks up $theHash{$table}{$joinkey}, cleans the value and splits it into
# @data, but never adds anything to $cur_entry: $cur_entry is returned
# exactly as it was passed in.  $tag, $objName and $goodGenes_ref are unused.
sub getData {
  my ($cur_entry, $table, $joinkey, $tag, $objName, $goodGenes_ref) = @_;
  if ($theHash{$table}{$joinkey}) {
    my $data = $theHash{$table}{$joinkey};
    # strip one leading and one trailing double quote
    if ($data =~ m/^\"/) { $data =~ s/^\"//; }
    if ($data =~ m/\"$/) { $data =~ s/\"$//; }
    # The next pattern contains a literal line break.
    # NOTE(review): presumably a literal carriage return that was lost when the code was copied - TODO confirm
    if ($data =~ m/
/) { $data =~ s/
//g; }
    # replace each newline with two spaces, then trim surrounding whitespace
    if ($data =~ m/\n/) { $data =~ s/\n/  /g; }
    if ($data =~ m/^\s+/) { $data =~ s/^\s+//g; } if ($data =~ m/\s+$/) { $data =~ s/\s+$//g; }
    # split on "," first, else on ' | ' for tables listed in %pipeSplit, else keep a single value
    my @data;
    if ($data =~ m/\",\"/) { @data = split/\",\"/, $data; }
      elsif ($pipeSplit{$table}) { @data = split/ \| /, $data; }
      else { push @data, $data; }
    foreach my $value (@data) {
      # escape the double quotes inside each value
      if ($value =~ m/\"/) { $value =~ s/\"/\\\"/g; }
    } # foreach my $value (@data)
  }
  return $cur_entry;
} # sub getData

use_package.pl


#!/usr/bin/perl

# Use the get_dis_disease_ace.pm module from
# /home/postgres/work/citace_upload/dis_disease/ to dump the disease data
# as an .ace file, plus a file with the errors associated with it.
#
# Output files (both are always created, in the current directory):
#   disease_<date>.ace   the .ace dump
#   err.out.<date>       the error messages; stays empty when there are none

use strict;
use warnings;
use Jex;

my $date = getSimpleSecDate();
my $start_time = time;
my $estimate_time = time + 697;                         # estimated finish: 697 seconds after the start
my ($sec, $min, $hour) = localtime($estimate_time);     # get time
printf "START %s -> Estimate %d:%02d:%02d\n", $date, $hour, $min, $sec;        # zero-pad minutes and seconds

$date = getSimpleDate();

use lib qw( /home/postgres/work/citace_upload/dis_disease/ );  # tells the script where to find the perl module
use get_dis_disease_ace;

my $outfile = 'disease_' . $date . '.ace';
my $errfile = 'err.out.' . $date;

# three-argument open with lexical handles, so the file name can never be read as a mode
open (my $out_fh, '>', $outfile) or die "Cannot create $outfile : $!\n";
open (my $err_fh, '>', $errfile) or die "Cannot create $errfile : $!\n";


my ($all_entry, $err_text) = getDisease('all');         # 'all' gets all objects; pass a WBGene to get only that gene

print {$out_fh} "$all_entry\n";
if ($err_text) { print {$err_fh} "$err_text"; }         # errors are written only when some were found

close ($out_fh) or die "Cannot close $outfile : $!";
close ($err_fh) or die "Cannot close $errfile : $!";

$date = getSimpleSecDate();
my $end_time = time;
my $diff_time = $end_time - $start_time;
print "DIFF $diff_time\n";
print "END $date\n";

Back To OA_and_scripts_for_disease_data