Difference between revisions of "Updating Postgres with New WS Information"

From WormBaseWiki
Jump to navigationJump to search
m (Created page with 'This page is dedicated to documenting the script: populate_pg_from_ws.pl located at: /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates which runs as a cronjob at …')
 
m
Line 10: Line 10:
  
 
  0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl
 
  0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl
 +
 +
 +
<pre>
 +
use Ace;
 +
use strict;
 +
use diagnostics;
 +
use Jex;
 +
use DBI;
 +
 +
my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n";
 +
my $result = '';
 +
 +
my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/';      # for testing on mangolassi
 +
# my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/';  # on tazendra to go live
 +
my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/';
 +
 +
my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate';
 +
open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!";
 +
my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate;
 +
close (IN) or die "Cannot close $latestUpdateFile : $!";
 +
 +
$result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" );
 +
$result->execute() or die "Cannot prepare statement: $DBI::errstr\n";
 +
my @row = $result->fetchrow(); my $pgTimestamp = $row[2];
 +
my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/;
 +
my $fromPgSimpleDate = $year . $month . $day;
 +
 +
# if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; }
 +
 +
my $timestamp = &getPgDate();
 +
if ($wsUpdateDate > $fromPgSimpleDate) {                        # if ws was updated more recently than postgres timestamp, do the updates
 +
  &processGeneCds();
 +
  &processExprCluster();
 +
  &processFeature();
 +
#  print "DONE\n";
 +
}
 +
</pre>
 +
 +
<pre>
 +
sub processGeneCds {
 +
  my @pgcommands;
 +
  my %cdsToProt;
 +
  my $molfile = $pg_files_dir . 'gin_molname.pg';
 +
  my $seqfile = $pg_files_dir . 'gin_sequence.pg';
 +
  my $protfile = $pg_files_dir . 'gin_protein.pg';
 +
  my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg';
 +
  open (MOL, ">$molfile") or die "Cannot create $molfile : $!";
 +
  open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!";
 +
  open (PRO, ">$protfile") or die "Cannot create $protfile : $!";
 +
  open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!";
 +
  $/ = "";
 +
  my $infile = $ws_files_dir . 'WSCDS.ace';
 +
  open (IN, "<$infile") or die "Cannot open $infile : $!";
 +
  while (my $entry = <IN>) {
 +
    next unless ($entry =~ m/CDS : \"/);
 +
    $entry =~ s/\\//g;                          # take out all backslashes
 +
    my (@lines) = split/\n/, $entry;
 +
    my $header = shift @lines;
 +
    my $cds = '';
 +
    if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; }
 +
    foreach my $line (@lines) {
 +
      if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } }
 +
  } # while (my $entry = <IN>)
 +
  close (IN) or die "Cannot close $infile : $!";
 +
  $infile = $ws_files_dir . 'WSGene.ace';
 +
  open (IN, "<$infile") or die "Cannot open $infile : $!";
 +
  while (my $entry = <IN>) {
 +
    next unless ($entry =~ m/Gene : \"/);
 +
    $entry =~ s/\\//g;                          # take out all backslashes
 +
    my %data;
 +
    my (@lines) = split/\n/, $entry;
 +
    my $header = shift @lines;
 +
    my $joinkey = '';
 +
    if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; }
 +
    my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS );
 +
    foreach my $line (@lines) {
 +
      foreach my $tag (@tags) {
 +
        if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } }
 +
    foreach my $molname (sort keys %{ $data{"Molecular_name"} }) {
 +
      print MOL qq($joinkey\t$molname\t$timestamp\n); }
 +
    foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) {
 +
      print SEQ qq($joinkey\t$sequence\t$timestamp\n); }
 +
    foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) {
 +
      print SEQ qq($joinkey\t$sequence\t$timestamp\n);
 +
      if ($cdsToProt{$sequence}) {
 +
        my $protein = $cdsToProt{$sequence};
 +
        print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n);
 +
        print PRO qq($joinkey\t$protein\t$timestamp\n); } }
 +
  } # while (my $entry = <IN>)
 +
  close (IN) or die "Cannot close $infile : $!";
 +
  $/ = "\n";
 +
  close (MOL) or die "Cannot close $molfile : $!";
 +
  close (SEQ) or die "Cannot close $seqfile : $!";
 +
  close (PRO) or die "Cannot close $protfile : $!";
 +
  close (SPR) or die "Cannot close $seqprotfile : $!";
 +
  push @pgcommands, qq(DELETE FROM gin_molname;);
 +
  push @pgcommands, qq(DELETE FROM gin_sequence;);
 +
  push @pgcommands, qq(DELETE FROM gin_protein;);
 +
  push @pgcommands, qq(DELETE FROM gin_seqprot;);
 +
  push @pgcommands, qq(COPY gin_molname  FROM '$molfile';);
 +
  push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';);
 +
  push @pgcommands, qq(COPY gin_protein  FROM '$protfile';);
 +
  push @pgcommands, qq(COPY gin_seqprot  FROM '$seqprotfile';);
 +
 +
  foreach my $pgcommand (@pgcommands) {
 +
    print qq($pgcommand\n);
 +
    $dbh->do( $pgcommand );
 +
  } # foreach my $pgcommand (@pgcommands)
 +
} # sub processGeneCds
 +
</pre>

Revision as of 23:38, 14 November 2013

This page is dedicated to documenting the script:

populate_pg_from_ws.pl

located at:

/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates

which runs as a cronjob at 5am Pacific time every day:

0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl


use Ace;
use strict;
use diagnostics;
use Jex;
use DBI;

my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n";
my $result = '';

my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/';      # for testing on mangolassi
# my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/';   # on tazendra to go live
my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/';

my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate';
open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!";
my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate;
close (IN) or die "Cannot close $latestUpdateFile : $!";

$result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" );
$result->execute() or die "Cannot prepare statement: $DBI::errstr\n";
my @row = $result->fetchrow(); my $pgTimestamp = $row[2];
my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/;
my $fromPgSimpleDate = $year . $month . $day;

# if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; }

my $timestamp = &getPgDate();
if ($wsUpdateDate > $fromPgSimpleDate) {                        # if ws was updated more recently than postgres timestamp, do the updates
  &processGeneCds();
  &processExprCluster();
  &processFeature();
#   print "DONE\n";
}
sub processGeneCds {
  my @pgcommands;
  my %cdsToProt;
  my $molfile = $pg_files_dir . 'gin_molname.pg';
  my $seqfile = $pg_files_dir . 'gin_sequence.pg';
  my $protfile = $pg_files_dir . 'gin_protein.pg';
  my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg';
  open (MOL, ">$molfile") or die "Cannot create $molfile : $!";
  open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!";
  open (PRO, ">$protfile") or die "Cannot create $protfile : $!";
  open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!";
  $/ = "";
  my $infile = $ws_files_dir . 'WSCDS.ace';
  open (IN, "<$infile") or die "Cannot open $infile : $!";
  while (my $entry = <IN>) {
    next unless ($entry =~ m/CDS : \"/);
    $entry =~ s/\\//g;                          # take out all backslashes
    my (@lines) = split/\n/, $entry;
    my $header = shift @lines;
    my $cds = '';
    if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; }
    foreach my $line (@lines) {
      if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } }
  } # while (my $entry = <IN>)
  close (IN) or die "Cannot close $infile : $!";
  $infile = $ws_files_dir . 'WSGene.ace';
  open (IN, "<$infile") or die "Cannot open $infile : $!";
  while (my $entry = <IN>) {
    next unless ($entry =~ m/Gene : \"/);
    $entry =~ s/\\//g;                          # take out all backslashes
    my %data;
    my (@lines) = split/\n/, $entry;
    my $header = shift @lines;
    my $joinkey = '';
    if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; }
    my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS );
    foreach my $line (@lines) {
      foreach my $tag (@tags) {
        if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } }
    foreach my $molname (sort keys %{ $data{"Molecular_name"} }) {
      print MOL qq($joinkey\t$molname\t$timestamp\n); }
    foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) {
      print SEQ qq($joinkey\t$sequence\t$timestamp\n); }
    foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) {
      print SEQ qq($joinkey\t$sequence\t$timestamp\n);
      if ($cdsToProt{$sequence}) {
        my $protein = $cdsToProt{$sequence};
        print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n);
        print PRO qq($joinkey\t$protein\t$timestamp\n); } }
  } # while (my $entry = <IN>)
  close (IN) or die "Cannot close $infile : $!";
  $/ = "\n";
  close (MOL) or die "Cannot close $molfile : $!";
  close (SEQ) or die "Cannot close $seqfile : $!";
  close (PRO) or die "Cannot close $protfile : $!";
  close (SPR) or die "Cannot close $seqprotfile : $!";
  push @pgcommands, qq(DELETE FROM gin_molname;);
  push @pgcommands, qq(DELETE FROM gin_sequence;);
  push @pgcommands, qq(DELETE FROM gin_protein;);
  push @pgcommands, qq(DELETE FROM gin_seqprot;);
  push @pgcommands, qq(COPY gin_molname  FROM '$molfile';);
  push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';);
  push @pgcommands, qq(COPY gin_protein  FROM '$protfile';);
  push @pgcommands, qq(COPY gin_seqprot  FROM '$seqprotfile';);

  foreach my $pgcommand (@pgcommands) {
    print qq($pgcommand\n);
    $dbh->do( $pgcommand );
  } # foreach my $pgcommand (@pgcommands)
} # sub processGeneCds