#!/usr/bin/perl
=head1 NAME
meme_xml_to_html - Make a MEME HTML output from a MEME XML output.
=head1 SYNOPSIS
meme_xml_to_html
=cut
use strict;
use warnings;
use Cwd qw(abs_path);
use Fcntl qw(O_RDONLY SEEK_SET);
use File::Basename qw(fileparse);
use File::Spec::Functions qw(tmpdir);
use File::Temp qw(tempfile);
use Getopt::Long;
use Pod::Usage;
use lib '/mit/meme_v4.11.4/lib/perl';
my $etc_dir;
my $temp_dir;
my $scripts_dir;
#
# initialise the global constants
#
sub initialise {
# setup etc dir
$etc_dir = defined($ENV{MEME_ETC_DIR}) ? $ENV{MEME_ETC_DIR} : '/mit/meme_v4.11.4/etc';
# setup temporary directory
$temp_dir = '';
# use the perl default if none is supplied or the replace fails
$temp_dir = tmpdir() if ($temp_dir eq '' || $temp_dir =~ m/^\@TMP[_]DIR\@$/);
# find the location of the script
my $script_name;
($script_name, $scripts_dir) = fileparse(__FILE__);
$scripts_dir = abs_path($scripts_dir);
# add script location to search path
unshift(@INC, $scripts_dir);
require HtmlMonolithWr;
require MemeSAX;
}
sub arguments {
# Set Option Defaults
my $options = {XML_PATH => undef, HTML_PATH => undef};
# General Options
my $help = 0; # FALSE
my @errors = ();
my @dbs = ();
# get the options from the arguments
my $options_success = 0; # FALSE
# redirect stderr to a temp file so we can get the error message from GetOptions
my $olderr;
my $tmperr = tempfile('GetOptions_XXXXXXXXXX', DIR => $temp_dir, UNLINK => 1);
open($olderr, ">&STDERR") or die("Can't dup STDERR: $!");
open(STDERR, '>&', $tmperr) or die("Can't redirect STDERR to temp file: $!");
# parse options
$options_success = GetOptions(
'help|?' => \$help,
);
($options->{XML_PATH}, $options->{HTML_PATH}) = @ARGV;
# display help
pod2usage(1) if $help;
# reset STDERR
open(STDERR, ">&", $olderr) or die("Can't reset STDERR: $!");
# read argument parsing errors
seek($tmperr, 0, SEEK_SET);
while (<$tmperr>) {chomp; push(@errors, $_);}
close($tmperr);
# check source XML file
unless (defined($options->{XML_PATH})) {
push(@errors, "No MEME XML file specified");
} elsif (not -e $options->{XML_PATH}) {
push(@errors, "The MEME XML file specified does not exist");
}
unless (defined($options->{HTML_PATH})) {
push(@errors, "No output file specified");
}
# print errors
foreach my $error (@errors) {
print STDERR $error, "\n";
}
pod2usage(2) if @errors;
# return options
return $options;
}
# start_meme
sub start_meme {
my ($info, $major_ver, $minor_ver, $patch_ver, $release) = @_;
my $wr = $info->{wr};
$wr->str_prop('program', 'MEME');
$wr->str_prop('version', "$major_ver.$minor_ver.$patch_ver");
# meme doesn't include the revision information in the XML
$wr->str_prop('release', $release);
}
# end_meme
sub end_meme {
my ($info) = @_;
my $wr = $info->{wr};
}
# start_training_set
sub start_training_set {
my ($info, $datafile, $length) = @_;
$info->{sequence_db} = {};
$info->{sequence_db}->{source} = $datafile;
$info->{sequence_db}->{temp} = tempfile("seq_db_info_XXXXXX",
SUFFIX => '.tsv', DIR => $temp_dir, UNLINK => 1);
$info->{counter} = 0;
}
# end_training_set
sub end_training_set {
my ($info) = @_;
}
# handle_alphabet
sub handle_alphabet {
my ($info, $alphabet) = @_;
$info->{alph} = $alphabet;
}
# handle_sequence
sub handle_sequence {
my ($info, $num, $name, $length, $weight) = @_;
# num is the number extracted out of the id
# sequence_x where x is from 0 to the number of sequences -1
# and where x increments with each sequence
die("Sequence id number is not expected") unless ($info->{counter} == $num);
# store the sequence information to a temporary file as we don't want to write
# it out yet.
my $fh = $info->{sequence_db}->{temp};
print $fh "$name\t$length\t$weight\n";
$info->{counter}++;
}
# handle_letter_frequencies
sub handle_letter_frequencies {
my ($info, $freqs) = @_;
$info->{sequence_db}->{freqs} = $freqs;
}
# handle_background_frequencies
sub handle_background_frequencies {
my ($info, $source, $freqs) = @_;
$info->{bg} = {source => $source, freqs => $freqs};
}
# end_model
sub handle_settings {
my ($info, %settings) = @_;
my $wr = $info->{wr};
# determine stopping reason
my $reason = $settings{reason_for_stopping};
my $stop_type = "unknown";
my $num_re = qr/((?:[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)|inf)/;
my $max_time;
if ($reason =~ m/Stopped because couldn't find any more starting points for EM\./) {
$stop_type = "em";
} elsif ($reason =~ m/Stopped because nmotifs = (\d+) reached\./) {
$stop_type = "motifs";
} elsif ($reason =~ m/Stopped because would probably run out of time \($num_re secs\)\./) {
$stop_type = "time";
$max_time = $1;
} elsif ($reason =~ m/Stopped because next motif has fewer than (\d+) sites\./) {
$stop_type = "sites";
} elsif ($reason =~ m/Stopped because motif E-value > $num_re\./) {
$stop_type = "evalue";
}
#$wr->str_prop("stop_reason", $stop_type);
$wr->str_prop("stop_reason", $reason);
# output command line
$wr->str_array_prop("cmd", split(/\s+/, $settings{command_line}));
# output options
$wr->property("options");
$wr->start_object_value();
# BEGIN OPTIONS (note that I am deliberately using the names of the command line options)
# site distribution
my %dtype = ('zoops' => 'zoops', 'oops' => 'oops', 'anr' => 'anr', 'tcm' => 'anr');
$wr->str_prop("mod", $dtype{$settings{type}});
# reverse complements
$wr->bool_prop("revcomp", $settings{strands} eq "both");
# stop conditions
$wr->num_prop("nmotifs", $settings{nmotifs});
$wr->num_prop("evt", $settings{evalue_threshold}) unless ($settings{evalue_threshold} =~ m/^inf(inity)?$/i);
$wr->num_prop("time", $max_time) if (defined($max_time)); # we only get this value when the stop reason is time
# motif width
$wr->num_prop("minw", $settings{min_width});
$wr->num_prop("maxw", $settings{max_width});
#sites
$wr->num_prop("minsites", $settings{minsites});
$wr->num_prop("maxsites", $settings{maxsites});
$wr->num_prop("wnsites", $settings{wnsites});
# selecting starts for EM
$wr->str_prop("spmap", $settings{spmap});
$wr->num_prop("spfuzz", $settings{spfuzz});
$wr->num_prop("maxwords", $settings{maxwords});
# EM algorithm
$wr->str_prop("prior", $settings{prior});
$wr->num_prop("b", $settings{beta});
$wr->num_prop("maxiter", $settings{maxiter});
$wr->num_prop("distance", $settings{distance});
# trimming alignment costs
$wr->num_prop("wg", $settings{wg});
$wr->num_prop("ws", $settings{ws});
$wr->bool_prop("noendgaps", !$settings{endgaps});
# EM starting point source
$wr->bool_prop("substring", $settings{substring});
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# These experimental options made it into the XML but
# are probably not interesting so I've commented them
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Experimental random sampling (hidden option, default 0)
#$wr->num_prop("seed", $settings{seed});
# sampling probability for subsq. starts (removed option, normally 1 but 0 if mfile set)
# objective function (hidden option, default ev)
my %objfun = ('P-value of product of p-values' => 'pv',
'E-value of product of p-values' => 'ev');
#$wr->str_prop("objfun", $objfun{$settings{object_function}});
# fraction of sequences to use (hidden option, default 1)
#$wr->num_prop("seqfrac", $settings{seqfrac});
# END OPTIONS
$wr->end_object_value();
$wr->property("alphabet");
$info->{alph}->to_json($wr);
$wr->property("background");
$wr->start_object_value();
unless ($info->{bg}->{source} =~ m/^dataset with add-one prior applied$/) {
$wr->str_prop("source", $info->{bg}->{source});
}
$wr->num_array_prop("freqs", @{$info->{bg}->{freqs}});
$wr->end_object_value();
$wr->property("sequence_db");
$wr->start_object_value();
$wr->str_prop("source", $info->{sequence_db}->{source});
$wr->str_prop("psp_source", $settings{priors_file}) if ($settings{priors_file} =~ m/\S/);
$wr->num_array_prop("freqs", @{$info->{sequence_db}->{freqs}});
$wr->property("sequences");
$wr->start_array_value();
my $fh = $info->{sequence_db}->{temp};
seek($fh, 0, SEEK_SET);
while (<$fh>) {
chomp($_);
my ($name, $len, $weight) = split(/\t/, $_);
$wr->start_object_value();
$wr->str_prop("name", $name);
$wr->num_prop("length", $len);
$wr->num_prop("weight", $weight);
$wr->end_object_value();
}
$wr->end_array_value();
$wr->end_object_value();
$info->{sequence_db}->{temp} = undef;
close($fh);
}
# start_background_frequencies
sub start_background_frequencies {
my ($info, $source) = @_;
$info->{background_source} = $source;
}
# end_background_frequencies
sub end_background_frequencies {
my ($info) = @_;
}
# start_bf_alphabet_array
sub start_bf_alphabet_array {
my ($info) = @_;
my $wr = $info->{wr};
}
# end_bf_alphabet_array
sub end_bf_alphabet_array {
my ($info) = @_;
my $wr = $info->{wr};
}
# handle_bf_aa_value
sub handle_bf_aa_value {
my ($info, $letter_id, $frequency) = @_;
my $wr = $info->{wr};
my $letter = $info->{id_to_symbol}->{$letter_id};
die("Expected non-ambiguous letter") if $letter->{ambig};
$info->{background}->{$letter->{symbol}} = $frequency;
}
# start_motifs
sub start_motifs {
my ($info) = @_;
my $wr = $info->{wr};
$wr->property("motifs");
$wr->start_array_value();
}
# end_motifs
sub end_motifs {
my ($info) = @_;
my $wr = $info->{wr};
$wr->end_array_value();
}
# start_motif
sub start_motif {
my ($info, %motif) = @_;
my $wr = $info->{wr};
$wr->start_object_value();
$wr->num_prop("db", 0);
$wr->str_prop("id", $motif{NAME});
$wr->str_prop("alt", $motif{ALT});
$wr->num_prop("len", $motif{WIDTH});
$wr->num_prop("nsites", $motif{SITES});
$wr->str_prop("evalue", $motif{EVALUE});
$wr->num_prop("ic", $motif{IC}); #information content
$wr->num_prop("re", $motif{RE}); #relative entropy
$wr->num_prop("llr", $motif{LLR}); #log likelihood ratio
$wr->num_prop("bt", $motif{BAYES}); # bayes threshold
$wr->num_prop("time", $motif{ELAPSED});
$wr->str_prop("url", $motif{URL}) if $motif{URL};
my $psm = $motif{PSM};
$wr->property("psm");
$wr->start_array_value();
for (my $i = 0; $i < scalar(@{$psm}); $i++) {
$wr->num_array_value(@{$psm->[$i]});
}
$wr->end_array_value();
my $pwm = $motif{PWM};
$wr->property("pwm");
$wr->start_array_value();
for (my $i = 0; $i < scalar(@{$pwm}); $i++) {
$wr->num_array_value(@{$pwm->[$i]});
}
$wr->end_array_value();
$wr->property("sites");
$wr->start_array_value();
}
sub handle_site {
my ($info, $seq_num, $pos, $rc, $pvalue, $lflank, $match, $rflank) = @_;
my $wr = $info->{wr};
$wr->start_object_value();
$wr->num_prop("seq", $seq_num);
$wr->num_prop("pos", $pos);
$wr->bool_prop("rc", $rc);
$wr->num_prop("pvalue", $pvalue);
$wr->str_prop("lflank", $lflank);
$wr->str_prop("match", $match);
$wr->str_prop("rflank", $rflank);
$wr->end_object_value();
}
# end_motif
sub end_motif {
my ($info) = @_;
my $wr = $info->{wr};
$wr->end_array_value();
$wr->end_object_value();
}
# start_scanned_sites_summary
sub start_scanned_sites_summary {
my ($info, $scan_threhold) = @_;
my $wr = $info->{wr};
$wr->property("scan");
$wr->start_array_value();
$info->{counter} = 0;
}
# end_scanned_sites_summary
sub end_scanned_sites_summary {
my ($info) = @_;
my $wr = $info->{wr};
$wr->end_array_value();
}
# start_scanned_sites
sub start_scanned_sites {
my ($info, $num, $pvalue, $num_sites) = @_;
my $wr = $info->{wr};
die("Sequence id number is not expected") unless ($info->{counter} == $num);
$wr->start_object_value();
$wr->num_prop("pvalue", $pvalue);
$wr->property("sites");
$wr->start_array_value();
}
# end_scanned_sites
sub end_scanned_sites {
my ($info) = @_;
my $wr = $info->{wr};
$wr->end_array_value();
$wr->end_object_value();
$info->{counter}++;
}
# handle_scanned_site
sub handle_scanned_site {
my ($info, $motif_num, $strand, $pos, $pvalue) = @_;
my $wr = $info->{wr};
$wr->start_object_value();
$wr->num_prop("motif", $motif_num - 1); # convert to index
$wr->num_prop("pos", $pos); # TODO check is zero based.
$wr->bool_prop("rc", $strand eq 'minus');
$wr->num_prop("pvalue", $pvalue);
$wr->end_object_value();
}
sub transform_data {
my ($opts, $jsonwr) = @_;
my $info = {wr => $jsonwr};
my $sax = new MemeSAX($info,
start_meme => \&start_meme,
start_training_set => \&start_training_set,
handle_alphabet => \&handle_alphabet,
handle_sequence => \&handle_sequence,
end_training_set => \&end_training_set,
handle_letter_frequencies => \&handle_letter_frequencies,
handle_background_frequencies => \&handle_background_frequencies,
handle_settings => \&handle_settings,
start_motifs => \&start_motifs,
start_motif => \&start_motif,
handle_site => \&handle_site,
end_motif => \&end_motif,
end_motifs => \&end_motifs,
start_scanned_sites_summary => \&start_scanned_sites_summary,
start_scanned_sites => \&start_scanned_sites,
handle_scanned_site => \&handle_scanned_site,
end_scanned_sites => \&end_scanned_sites,
end_scanned_sites_summary => \&end_scanned_sites_summary,
end_meme => \&end_meme,
);
my $fh;
sysopen($fh, $opts->{XML_PATH}, O_RDONLY) or die("Failed to open file \"$opts->{XML_PATH}\"\n");
while (<$fh>) {
$sax->parse_more($_);
if ($sax->has_errors()) {
die("Failed to write HTML output due to errors processing the XML:\n" . join("\n", $sax->get_errors()));
}
}
$sax->parse_done();
if ($sax->has_errors()) {
die("Failed to write HTML output due to errors processing the XML:\n" . join("\n", $sax->get_errors()));
}
my @errors = $sax->get_errors();
foreach my $error (@errors) {
print $error, "\n";
}
}
sub main {
&initialise();
my $opts = &arguments();
# start writing HTML
my $htmlwr = new HtmlMonolithWr($etc_dir, 'meme_template.html',
$opts->{HTML_PATH}, 'meme_data.js' => 'data');
# transform the XML into JSON
&transform_data($opts, $htmlwr->output());
# finish writing HTML
$htmlwr->output();
}
&main();
1;