Updating Postgres with New WS Information
From WormBaseWiki
Jump to navigationJump to searchThis page is dedicated to documenting the script:
populate_pg_from_ws.pl
located at:
/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates
which runs as a cronjob at 5am Pacific time every day:
0 5 * * * /home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/populate_pg_from_ws.pl
Here is the code for the script:
use Ace; use strict; use diagnostics; use Jex; use DBI; my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb", "", "") or die "Cannot connect to database!\n"; my $result = ''; my $ws_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; # for testing on mangolassi # my $ws_files_dir = '/home3/acedb/cron/dump_from_ws/files/'; # on tazendra to go live my $pg_files_dir = '/home/postgres/work/pgpopulation/obo_oa_ontologies/ws_updates/files/'; my $latestUpdateFile = '/home3/acedb/cron/dump_from_ws/files/latestDate'; open (IN, "<$latestUpdateFile") or die "Cannot open $latestUpdateFile : $!"; my ($wsUpdateDate) = <IN>; chomp $wsUpdateDate; close (IN) or die "Cannot close $latestUpdateFile : $!"; $result = $dbh->prepare( "SELECT * FROM gin_molname ORDER BY gin_timestamp DESC;" ); $result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; my @row = $result->fetchrow(); my $pgTimestamp = $row[2]; my ($year, $month, $day) = $pgTimestamp =~ m/^(\d{4})-(\d{2})-(\d{2})/; my $fromPgSimpleDate = $year . $month . $day; # if ($wsUpdateDate > $fromPgSimpleDate) { print "do stuff\n"; } else { print "do nothing\n"; } my $timestamp = &getPgDate(); if ($wsUpdateDate > $fromPgSimpleDate) { # if ws was updated more recently than postgres timestamp, do the updates &processGeneCds(); &processExprCluster(); &processFeature(); # print "DONE\n"; }
sub processGeneCds { my @pgcommands; my %cdsToProt; my $molfile = $pg_files_dir . 'gin_molname.pg'; my $seqfile = $pg_files_dir . 'gin_sequence.pg'; my $protfile = $pg_files_dir . 'gin_protein.pg'; my $seqprotfile = $pg_files_dir . 'gin_seqprot.pg'; open (MOL, ">$molfile") or die "Cannot create $molfile : $!"; open (SEQ, ">$seqfile") or die "Cannot create $seqfile : $!"; open (PRO, ">$protfile") or die "Cannot create $protfile : $!"; open (SPR, ">$seqprotfile") or die "Cannot create $seqprotfile : $!"; $/ = ""; my $infile = $ws_files_dir . 'WSCDS.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/CDS : \"/); $entry =~ s/\\//g; # take out all backslashes my (@lines) = split/\n/, $entry; my $header = shift @lines; my $cds = ''; if ($header =~ m/CDS : \"([^"]+)\"/) { $cds = $1; } foreach my $line (@lines) { if ($line =~ m/^Corresponding_protein\t \"(.*)\"/) { $cdsToProt{$cds} = $1; } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $infile = $ws_files_dir . 'WSGene.ace'; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/Gene : \"/); $entry =~ s/\\//g; # take out all backslashes my %data; my (@lines) = split/\n/, $entry; my $header = shift @lines; my $joinkey = ''; if ($header =~ m/Gene : \"WBGene([^"]+)\"/) { $joinkey = $1; } my @tags = qw( Molecular_name Corresponding_transcript Corresponding_CDS ); foreach my $line (@lines) { foreach my $tag (@tags) { if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } foreach my $molname (sort keys %{ $data{"Molecular_name"} }) { print MOL qq($joinkey\t$molname\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_transcript"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); } foreach my $sequence (sort keys %{ $data{"Corresponding_CDS"} }) { print SEQ qq($joinkey\t$sequence\t$timestamp\n); if ($cdsToProt{$sequence}) { my $protein = $cdsToProt{$sequence}; print SPR qq($joinkey\t$sequence\t$protein\t$timestamp\n); print PRO qq($joinkey\t$protein\t$timestamp\n); } } } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $/ = "\n"; close (MOL) or die "Cannot close $molfile : $!"; close (SEQ) or die "Cannot close $seqfile : $!"; close (PRO) or die "Cannot close $protfile : $!"; close (SPR) or die "Cannot close $seqprotfile : $!"; push @pgcommands, qq(DELETE FROM gin_molname;); push @pgcommands, qq(DELETE FROM gin_sequence;); push @pgcommands, qq(DELETE FROM gin_protein;); push @pgcommands, qq(DELETE FROM gin_seqprot;); push @pgcommands, qq(COPY gin_molname FROM '$molfile';); push @pgcommands, qq(COPY gin_sequence FROM '$seqfile';); push @pgcommands, qq(COPY gin_protein FROM '$protfile';); push @pgcommands, qq(COPY gin_seqprot FROM '$seqprotfile';); foreach my $pgcommand (@pgcommands) { print qq($pgcommand\n); $dbh->do( $pgcommand ); } # foreach my $pgcommand (@pgcommands) } # sub processGeneCds
sub processExprCluster { my @pgcommands; my @tags = qw( Description Reference Remark ); my %tags; foreach (@tags) { $tags{$_}++; } my $name_file = $pg_files_dir . 'obo_name_exprcluster.pg'; my $data_file = $pg_files_dir . 'obo_data_exprcluster.pg'; open (NAME, ">$name_file") or die "Cannot create $name_file : $!"; open (DATA, ">$data_file") or die "Cannot create $data_file : $!"; my $infile = $ws_files_dir . 'WSExpression_cluster.ace'; $/ = ""; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/Expression_cluster : \"/); $entry =~ s/\\//g; # take out all backslashes my %data; my (@lines) = split/\n/, $entry; my $header = shift @lines; my $id = ''; if ($header =~ m/Expression_cluster : \"([^"]+)\"/) { $id = $1; } next if ($id =~ m/^WBPaper00029359/); # skip for Karen, chronograms. 2013 10 01 my @all_data; push @all_data, qq(id: $id); push @all_data, qq(name: "$id"); foreach my $line (@lines) { foreach my $tag (@tags) { if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } foreach my $tag (@tags) { foreach my $data (sort keys %{ $data{$tag} }) { my $lctag = lc($tag); push @all_data, qq($lctag: "$data"); } } my $all_data = join"\\n", @all_data; if ($all_data =~ m/\'/) { $all_data =~ s/\'/''/g; } print NAME qq($id\t$id\t$timestamp\n); print DATA qq($id\t$all_data\t$timestamp\n); } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $/ = "\n"; close (DATA) or die "Cannot close $data_file : $!"; close (NAME) or die "Cannot close $name_file : $!"; push @pgcommands, qq(DELETE FROM obo_name_exprcluster;); push @pgcommands, qq(DELETE FROM obo_data_exprcluster;); push @pgcommands, qq(COPY obo_name_exprcluster FROM '$name_file';); push @pgcommands, qq(COPY obo_data_exprcluster FROM '$data_file';); foreach my $pgcommand (@pgcommands) { print qq($pgcommand\n); $dbh->do( $pgcommand ); } # foreach my $pgcommand (@pgcommands) } # sub processExprCluster
sub processFeature { my @pgcommands; my @tags = qw( Public_name DNA_text Species Defined_by_paper Associated_with_gene Associated_with_Interaction Associated_with_expression_pattern Bound_by_product_of Transcription_factor Method ); my %allowedMethods; my %expectedMethods; my %unexpectedMethods; my @allowedMethods = qw( binding_site binding_site_region DNAseI_hypersensitive_site enhancer histone_binding_site_region promoter regulatory_region TF_binding_site TF_binding_site_region ); my @rejectedMethods = qw( Corrected_genome_sequence_error Genome_sequence_error history history_feature micro_ORF polyA_signal_sequence polyA_site segmental_duplication SL1 SL2 three_prime_UTR transcription_end_site transcription_start_site TSS_region ); foreach (@allowedMethods) { $allowedMethods{$_}++; $expectedMethods{$_}++; } foreach (@rejectedMethods) { $expectedMethods{$_}++; } my %tags; foreach (@tags) { $tags{$_}++; } my $name_file = $pg_files_dir . 'obo_name_feature.pg'; my $data_file = $pg_files_dir . 'obo_data_feature.pg'; open (NAME, ">$name_file") or die "Cannot create $name_file : $!"; open (DATA, ">$data_file") or die "Cannot create $data_file : $!"; my $infile = $ws_files_dir . 'WSFeature.ace'; $/ = ""; open (IN, "<$infile") or die "Cannot open $infile : $!"; while (my $entry = <IN>) { next unless ($entry =~ m/Feature : \"WBsf/); next unless ($entry =~ m/Method\s+\"(.*)\"/); my $method = $1; unless ($expectedMethods{$method}) { $unexpectedMethods{$method}++; } next unless ($allowedMethods{$method}); $entry =~ s/\\//g; # take out all backslashes my %data; my (@lines) = split/\n/, $entry; my $header = shift @lines; my $id = ''; if ($header =~ m/Feature : \"(WBsf\d+)\"/) { $id = $1; } my @all_data; push @all_data, qq(id: $id); push @all_data, qq(name: "$id"); foreach my $line (@lines) { foreach my $tag (@tags) { if ($line =~ m/^$tag\t \"(.*)\"/) { $data{$tag}{$1}++; } } } foreach my $tag (@tags) { foreach my $data (sort keys %{ $data{$tag} }) { my $lctag = lc($tag); push @all_data, qq($lctag: "$data"); } } my $all_data = join"\\n", @all_data; if ($all_data =~ m/\'/) { $all_data =~ s/\'/''/g; } print NAME qq($id\t$id\t$timestamp\n); print DATA qq($id\t$all_data\t$timestamp\n); } # while (my $entry = <IN>) close (IN) or die "Cannot close $infile : $!"; $/ = "\n"; close (DATA) or die "Cannot close $data_file : $!"; close (NAME) or die "Cannot close $name_file : $!"; push @pgcommands, qq(DELETE FROM obo_name_feature;); push @pgcommands, qq(DELETE FROM obo_data_feature;); push @pgcommands, qq(COPY obo_name_feature FROM '$name_file';); push @pgcommands, qq(COPY obo_data_feature FROM '$data_file';); if (scalar %unexpectedMethods > 0) { my $body = join", ", keys %unexpectedMethods; my $user = 'populate_pg_from_ws.pl'; my $email = 'cgrove@caltech.edu'; my $subject = 'unexpected feature methods'; &mailer($user, $email, $subject, $body); } # if (scalar %unexpectedMethods > 0) foreach my $pgcommand (@pgcommands) { print qq($pgcommand\n); $dbh->do( $pgcommand ); } # foreach my $pgcommand (@pgcommands) } # sub processFeature