Updating Postgres with New WS Information
From WormBaseWiki
Jump to navigationJump to searchThis page is dedicated to documenting the script:
populate_pg_from_ws.pl
located at:
/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates
which runs as a cronjob at 5am Pacific time every day:
0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl
use Ace; use strict; use diagnostics; use Jex; use DBI; my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n"; my $result = ''; my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; # for testing on mangolassi # my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/'; # on tazendra to go live my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate'; open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!"; my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate; close (IN) or die "Cannot close $latestUpdateFile : $!"; $result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" ); $result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; my @row = $result->fetchrow(); my $pgTimestamp = $row[2]; my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/; my $fromPgSimpleDate = $year . $month . $day; # if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; } my $timestamp = &getPgDate(); if ($wsUpdateDate > $fromPgSimpleDate) { # if ws was updated more recently than postgres timestamp, do the updates &processGeneCds(); &processExprCluster(); &processFeature(); # print "DONE\n"; }
sub processGeneCds { my @pgcommands; my %cdsToProt; my $molfile = $pg_files_dir . 'gin_molname.pg'; my $seqfile = $pg_files_dir . 'gin_sequence.pg'; my $protfile = $pg_files_dir . 'gin_protein.pg'; my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg'; open (MOL, ">$molfile") or die "Cannot create $molfile : $!"; open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!"; open (PRO, ">$protfile") or die "Cannot create $protfile : $!"; open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!"; $/ = ""; my $infile = $ws_files_dir . 'WSCDS.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/CDS : \"/); $entry =~ s/\\//g; # take out all backslashes my (@lines) = split/\n/, $entry; my $header = shift @lines; my $cds = ''; if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; } foreach my $line (@lines) { if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $infile = $ws_files_dir . 'WSGene.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/Gene : \"/); $entry =~ s/\\//g; # take out all backslashes my %data; my (@lines) = split/\n/, $entry; my $header = shift @lines; my $joinkey = ''; if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; } my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS ); foreach my $line (@lines) { foreach my $tag (@tags) { if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } foreach my $molname (sort keys %{ $data{"Molecular_name"} }) { print MOL qq($joinkey\t$molname\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); if ($cdsToProt{$sequence}) { my $protein = $cdsToProt{$sequence}; print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n); print PRO qq($joinkey\t$protein\t$timestamp\n); } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $/ = "\n"; close (MOL) or die "Cannot close $molfile : $!"; close (SEQ) or die "Cannot close $seqfile : $!"; close (PRO) or die "Cannot close $protfile : $!"; close (SPR) or die "Cannot close $seqprotfile : $!"; push @pgcommands, qq(DELETE FROM gin_molname;); push @pgcommands, qq(DELETE FROM gin_sequence;); push @pgcommands, qq(DELETE FROM gin_protein;); push @pgcommands, qq(DELETE FROM gin_seqprot;); push @pgcommands, qq(COPY gin_molname FROM '$molfile';); push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';); push @pgcommands, qq(COPY gin_protein FROM '$protfile';); push @pgcommands, qq(COPY gin_seqprot FROM '$seqprotfile';); foreach my $pgcommand (@pgcommands) { print qq($pgcommand\n); $dbh->do( $pgcommand ); } # foreach my $pgcommand (@pgcommands) } # sub processGeneCds