Difference between revisions of "Updating Postgres with New WS Information"
From WormBaseWiki
Jump to navigationJump to searchm (Created page with 'This page is dedicated to documenting the script: populate_pg_from_ws.pl located at: /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates which runs as a cronjob at …') |
m |
||
Line 10: | Line 10: | ||
0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl | 0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl | ||
+ | |||
+ | |||
+ | <pre> | ||
+ | use Ace; | ||
+ | use strict; | ||
+ | use diagnostics; | ||
+ | use Jex; | ||
+ | use DBI; | ||
+ | |||
+ | my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n"; | ||
+ | my $result = ''; | ||
+ | |||
+ | my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; # for testing on mangolassi | ||
+ | # my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/'; # on tazendra to go live | ||
+ | my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; | ||
+ | |||
+ | my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate'; | ||
+ | open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!"; | ||
+ | my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate; | ||
+ | close (IN) or die "Cannot close $latestUpdateFile : $!"; | ||
+ | |||
+ | $result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" ); | ||
+ | $result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; | ||
+ | my @row = $result->fetchrow(); my $pgTimestamp = $row[2]; | ||
+ | my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/; | ||
+ | my $fromPgSimpleDate = $year . $month . $day; | ||
+ | |||
+ | # if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; } | ||
+ | |||
+ | my $timestamp = &getPgDate(); | ||
+ | if ($wsUpdateDate > $fromPgSimpleDate) { # if ws was updated more recently than postgres timestamp, do the updates | ||
+ | &processGeneCds(); | ||
+ | &processExprCluster(); | ||
+ | &processFeature(); | ||
+ | # print "DONE\n"; | ||
+ | } | ||
+ | </pre> | ||
+ | |||
+ | <pre> | ||
+ | sub processGeneCds { | ||
+ | my @pgcommands; | ||
+ | my %cdsToProt; | ||
+ | my $molfile = $pg_files_dir . 'gin_molname.pg'; | ||
+ | my $seqfile = $pg_files_dir . 'gin_sequence.pg'; | ||
+ | my $protfile = $pg_files_dir . 'gin_protein.pg'; | ||
+ | my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg'; | ||
+ | open (MOL, ">$molfile") or die "Cannot create $molfile : $!"; | ||
+ | open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!"; | ||
+ | open (PRO, ">$protfile") or die "Cannot create $protfile : $!"; | ||
+ | open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!"; | ||
+ | $/ = ""; | ||
+ | my $infile = $ws_files_dir . 'WSCDS.ace'; | ||
+ | open (IN, "<$infile") or die "Cannot open $infile : $!"; | ||
+ | while (my $entry = <IN>) { | ||
+ | next unless ($entry =~ m/CDS : \"/); | ||
+ | $entry =~ s/\\//g; # take out all backslashes | ||
+ | my (@lines) = split/\n/, $entry; | ||
+ | my $header = shift @lines; | ||
+ | my $cds = ''; | ||
+ | if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; } | ||
+ | foreach my $line (@lines) { | ||
+ | if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } } | ||
+ | } # while (my $entry = <IN>) | ||
+ | close (IN) or die "Cannot close $infile : $!"; | ||
+ | $infile = $ws_files_dir . 'WSGene.ace'; | ||
+ | open (IN, "<$infile") or die "Cannot open $infile : $!"; | ||
+ | while (my $entry = <IN>) { | ||
+ | next unless ($entry =~ m/Gene : \"/); | ||
+ | $entry =~ s/\\//g; # take out all backslashes | ||
+ | my %data; | ||
+ | my (@lines) = split/\n/, $entry; | ||
+ | my $header = shift @lines; | ||
+ | my $joinkey = ''; | ||
+ | if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; } | ||
+ | my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS ); | ||
+ | foreach my $line (@lines) { | ||
+ | foreach my $tag (@tags) { | ||
+ | if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } | ||
+ | foreach my $molname (sort keys %{ $data{"Molecular_name"} }) { | ||
+ | print MOL qq($joinkey\t$molname\t$timestamp\n); } | ||
+ | foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) { | ||
+ | print SEQ qq($joinkey\t$sequence\t$timestamp\n); } | ||
+ | foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) { | ||
+ | print SEQ qq($joinkey\t$sequence\t$timestamp\n); | ||
+ | if ($cdsToProt{$sequence}) { | ||
+ | my $protein = $cdsToProt{$sequence}; | ||
+ | print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n); | ||
+ | print PRO qq($joinkey\t$protein\t$timestamp\n); } } | ||
+ | } # while (my $entry = <IN>) | ||
+ | close (IN) or die "Cannot close $infile : $!"; | ||
+ | $/ = "\n"; | ||
+ | close (MOL) or die "Cannot close $molfile : $!"; | ||
+ | close (SEQ) or die "Cannot close $seqfile : $!"; | ||
+ | close (PRO) or die "Cannot close $protfile : $!"; | ||
+ | close (SPR) or die "Cannot close $seqprotfile : $!"; | ||
+ | push @pgcommands, qq(DELETE FROM gin_molname;); | ||
+ | push @pgcommands, qq(DELETE FROM gin_sequence;); | ||
+ | push @pgcommands, qq(DELETE FROM gin_protein;); | ||
+ | push @pgcommands, qq(DELETE FROM gin_seqprot;); | ||
+ | push @pgcommands, qq(COPY gin_molname FROM '$molfile';); | ||
+ | push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';); | ||
+ | push @pgcommands, qq(COPY gin_protein FROM '$protfile';); | ||
+ | push @pgcommands, qq(COPY gin_seqprot FROM '$seqprotfile';); | ||
+ | |||
+ | foreach my $pgcommand (@pgcommands) { | ||
+ | print qq($pgcommand\n); | ||
+ | $dbh->do( $pgcommand ); | ||
+ | } # foreach my $pgcommand (@pgcommands) | ||
+ | } # sub processGeneCds | ||
+ | </pre> |
Revision as of 23:38, 14 November 2013
This page is dedicated to documenting the script:
populate_pg_from_ws.pl
located at:
/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates
which runs as a cronjob at 5am Pacific time every day:
0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl
use Ace; use strict; use diagnostics; use Jex; use DBI; my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n"; my $result = ''; my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; # for testing on mangolassi # my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/'; # on tazendra to go live my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate'; open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!"; my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate; close (IN) or die "Cannot close $latestUpdateFile : $!"; $result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" ); $result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; my @row = $result->fetchrow(); my $pgTimestamp = $row[2]; my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/; my $fromPgSimpleDate = $year . $month . $day; # if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; } my $timestamp = &getPgDate(); if ($wsUpdateDate > $fromPgSimpleDate) { # if ws was updated more recently than postgres timestamp, do the updates &processGeneCds(); &processExprCluster(); &processFeature(); # print "DONE\n"; }
sub processGeneCds { my @pgcommands; my %cdsToProt; my $molfile = $pg_files_dir . 'gin_molname.pg'; my $seqfile = $pg_files_dir . 'gin_sequence.pg'; my $protfile = $pg_files_dir . 'gin_protein.pg'; my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg'; open (MOL, ">$molfile") or die "Cannot create $molfile : $!"; open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!"; open (PRO, ">$protfile") or die "Cannot create $protfile : $!"; open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!"; $/ = ""; my $infile = $ws_files_dir . 'WSCDS.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/CDS : \"/); $entry =~ s/\\//g; # take out all backslashes my (@lines) = split/\n/, $entry; my $header = shift @lines; my $cds = ''; if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; } foreach my $line (@lines) { if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $infile = $ws_files_dir . 'WSGene.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/Gene : \"/); $entry =~ s/\\//g; # take out all backslashes my %data; my (@lines) = split/\n/, $entry; my $header = shift @lines; my $joinkey = ''; if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; } my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS ); foreach my $line (@lines) { foreach my $tag (@tags) { if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } foreach my $molname (sort keys %{ $data{"Molecular_name"} }) { print MOL qq($joinkey\t$molname\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); if ($cdsToProt{$sequence}) { my $protein = $cdsToProt{$sequence}; print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n); print PRO qq($joinkey\t$protein\t$timestamp\n); } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $/ = "\n"; close (MOL) or die "Cannot close $molfile : $!"; close (SEQ) or die "Cannot close $seqfile : $!"; close (PRO) or die "Cannot close $protfile : $!"; close (SPR) or die "Cannot close $seqprotfile : $!"; push @pgcommands, qq(DELETE FROM gin_molname;); push @pgcommands, qq(DELETE FROM gin_sequence;); push @pgcommands, qq(DELETE FROM gin_protein;); push @pgcommands, qq(DELETE FROM gin_seqprot;); push @pgcommands, qq(COPY gin_molname FROM '$molfile';); push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';); push @pgcommands, qq(COPY gin_protein FROM '$protfile';); push @pgcommands, qq(COPY gin_seqprot FROM '$seqprotfile';); foreach my $pgcommand (@pgcommands) { print qq($pgcommand\n); $dbh->do( $pgcommand ); } # foreach my $pgcommand (@pgcommands) } # sub processGeneCds