#!/usr/bin/perl =head1 NAME mast_xml_to_txt - Make a MAST text output from a MAST XML output. =head1 SYNOPSIS mast_xml_to_txt =cut use strict; use warnings; use Carp qw(confess); use Cwd qw(abs_path); use Data::Dumper; use Encode qw(encode_utf8 decode_utf8); use Fcntl qw(O_RDONLY O_WRONLY O_CREAT O_TRUNC SEEK_SET SEEK_CUR SEEK_END); use File::Basename qw(fileparse); use File::Copy; use File::Spec::Functions qw(tmpdir); use File::Temp qw(tempfile); use Getopt::Long; use Pod::Usage; use XML::Parser::Expat; use lib '/mit/meme_v4.11.4/lib/perl'; my $INT_SIZE = length(pack('l', 0)); my $DBL_SIZE = length(pack('d', 0)); # some constants my $MAX_COMMENT_LINES = 10; my $PAGE_WIDTH = 80; # try to keep within this many columns my $NAME_WIDTH = 34; # length of the name column my $EV_WIDTH = 8; # length of the E-value column my $SLEN_WIDTH = 6; # length of the sequence length column my $STRAND_WIDTH = 6; # length of the strand column my $FRAME_WIDTH = 5; # length of the frame column my $COMMENT_WIDTH = $PAGE_WIDTH - ($NAME_WIDTH + 1) - 1 - ($EV_WIDTH + 1) - $SLEN_WIDTH; my $etc_dir; my $temp_dir; my $scripts_dir; # # initialise the global constants # sub initialise { # setup etc dir $etc_dir = defined($ENV{MEME_ETC_DIR}) ? $ENV{MEME_ETC_DIR} : '/mit/meme_v4.11.4/etc'; # setup temporary directory $temp_dir = ''; # use the perl default if none is supplied or the replace fails $temp_dir = tmpdir() if ($temp_dir eq '' || $temp_dir =~ m/^\@TMP[_]DIR\@$/); # find the location of the script my $script_name; ($script_name, $scripts_dir) = fileparse(__FILE__); $scripts_dir = abs_path($scripts_dir); # add script location to search path unshift(@INC, $scripts_dir); require HtmlMonolithWr; require MastSAX; } sub arguments { # Set Option Defaults my $options = {XML_PATH => undef, TXT_PATH => undef, DOC => 1}; # General Options my $help = 0; # FALSE my @errors = (); # get the options from the arguments my $options_success = 0; # FALSE # redirect stderr to a temp file so we can get the error message from GetOptions my $olderr; my $tmperr = tempfile('GetOptions_XXXXXXXXXX', DIR => $temp_dir, UNLINK => 1); open($olderr, ">&STDERR") or confess("Can't dup STDERR: $!"); open(STDERR, '>&', $tmperr) or confess("Can't redirect STDERR to temp file: $!"); # parse options $options_success = GetOptions( 'help|?' => \$help, ); ($options->{XML_PATH}, $options->{TXT_PATH}) = @ARGV; # display help pod2usage(1) if $help; # reset STDERR open(STDERR, ">&", $olderr) or confess("Can't reset STDERR: $!"); # read argument parsing errors seek($tmperr, 0, SEEK_SET); while (<$tmperr>) {chomp; push(@errors, $_);} close($tmperr); # check source XML file unless (defined($options->{XML_PATH})) { push(@errors, "No MAST XML file specified"); } elsif (not -e $options->{XML_PATH}) { push(@errors, "The MAST XML file specified does not exist"); } unless (defined($options->{TXT_PATH})) { push(@errors, "No output file specified"); } # print errors foreach my $error (@errors) { print STDERR $error, "\n"; } pod2usage(2) if @errors; # return options return $options; } sub _start_mast { my ($info, $version, $release) = @_; $info->{version} = $version; $info->{release} = $release; } sub _handle_command_line { my ($info, @args) = @_; $info->{args} = \@args; } sub _handle_settings { my ($info, $strand_handling, $max_correlation, $remove_correlated, $max_seq_evalue, $adjust_hit, $max_hit_pvalue, $max_weak_pvalue) = @_; $info->{strand_handling} = $strand_handling; $info->{max_correlation} = $max_correlation; $info->{remove_correlated} = $remove_correlated; $info->{max_seq_evalue} = $max_seq_evalue; $info->{max_hit_pvalue} = $max_hit_pvalue; $info->{max_weak_pvalue} = $max_weak_pvalue; $info->{adj_hit_pvalue} = $adjust_hit; } sub _handle_alphabet { my ($info, $alph) = @_; $info->{alph} = $alph; } sub _handle_sequence_alphabet { my ($info, $seq_alph) = @_; $info->{seq_alph} = $seq_alph; } sub _handle_translate { my ($info, $num_seq, $num_mot) = @_; $info->{xnum} = $num_seq; } sub _handle_background { my ($info, $from, @probs) = @_; $info->{bg_source} = $from; $info->{bg_freqs} = [@probs]; } sub _start_motif_dbs { my ($info) = @_; $info->{motif_dbs} = []; } sub _handle_motif_db { my ($info, $source, $name, $last_mod_date) = @_; my $motifdb = { name => (defined $name ? $name : scalar fileparse($source)) }; push(@{$info->{motif_dbs}}, $motifdb); } sub _end_motif_dbs { my ($info) = @_; } sub _start_motifs { my ($info) = @_; $info->{motifs} = []; } sub _handle_motif { my ($info, $db, $id, $alt, $len, $nsites, $evalue, $bad, $url, $psm) = @_; # convert the psm into the best match my $alph = $info->{alph}; my @bg = @{$info->{bg_freqs}}; my $best_f = ''; for (my $i = 0; $i < scalar(@{$psm}); $i++) { my $best_index = -1; my $best_score = '-inf'; for (my $j = 0; $j < scalar(@{$psm->[$i]}); $j++) { my $score = $psm->[$i]->[$j]; if ($score > $best_score) { $best_index = $j; $best_score = $score; } } $best_f .= $alph->char($best_index); } my $best_r = undef; if (defined $info->{seq_alph}) { if ($info->{seq_alph}->has_complement()) { # reverse but don't complement $best_r = reverse $best_f; } } elsif ($alph->has_complement()) { # reverse complement $best_r = $alph->rc_seq($best_f); } # create a motif object $alt = "-" unless (defined $alt); # Make sure "alt" is defined. my $motif = { db => $db, id => $id, alt => $alt, bad => $bad, len => $len, best_f => $best_f, best_r => $best_r }; push(@{$info->{motifs}}, $motif); } sub _end_motifs { my ($info) = @_; } sub _start_correlations { my ($info) = @_; $info->{correlation} = {}; } sub _handle_correlation { my ($info, $idx_a, $idx_b, $value) = @_; if ($idx_a < $idx_b) { my $tmp = $idx_a; $idx_a = $idx_b; $idx_b = $tmp; } my $set = $info->{correlation}; $set->{$idx_a} = {} unless defined $set->{$idx_a}; $set->{$idx_a}->{$idx_b} = $value; } sub _end_correlations { my ($info) = @_; } sub _start_nos { my ($info) = @_; $info->{nos} = ''; } sub _handle_expect { my ($info, $gap, $idx) = @_; $info->{nos} .= '_' . $gap if defined $gap; $info->{nos} .= '_' if length($info->{nos}); $info->{nos} .= '[' . ($idx + 1) . ']'; } sub _end_nos { my ($info) = @_; } sub _start_sequence_dbs { my ($info) = @_; $info->{sequence_dbs} = []; } sub _handle_sequence_db { my ($info, $source, $name, $last_mod_date, $sequence_count, $residue_count) = @_; my $seqdb = { name => (defined $name ? $name : scalar fileparse($source)), timestamp => $last_mod_date, nseq => $sequence_count, nres => $residue_count }; push(@{$info->{sequence_dbs}}, $seqdb); } sub _end_sequence_dbs { my ($info) = @_; } sub _start_sequences { my ($info) = @_; $info->{matching_nseqs} = 0; $info->{postponed_scores} = []; } sub _start_sequence { my ($info, $db, $name, $comment, $length) = @_; $info->{matching_nseqs} += 1 if ($info->{strand_handling} ne 'separate'); my $seq = { db => $db, name => $name, comment => $comment, length => $length, scores => [], segs => [], hits => [] }; $info->{seq} = $seq; } sub _handle_score { my ($info, $strand, $combined_pvalue, $evalue, $frame) = @_; $info->{matching_nseqs} += 1 if ($info->{strand_handling} eq 'separate'); my $score = { strand => ($strand eq 'both' ? 0 : ($strand eq 'forward' ? 1 : -1)), pvalue => $combined_pvalue, evalue => $evalue, frame => defined $frame ? ord($frame) - ord('a') : undef }; push(@{$info->{seq}->{scores}}, $score); } sub _start_seg { my ($info, $start_pos) = @_; my $seg = { start_pos => $start_pos - 1, # change to zero indexed sequence => undef, hits => [] }; $info->{seg} = $seg; } sub _handle_data { my ($info, $sequence) = @_; $info->{seg}->{sequence} = $sequence; } sub _handle_hit { my ($info, $pos, $idx, $rc, $pvalue, $match, $translation) = @_; my $hit = { pos => $pos - 1, # change to zero indexed motif => $idx, rc => $rc, pvalue => $pvalue, match => $match, translation => $translation }; push(@{$info->{seg}->{hits}}, $hit); } sub _end_seg { my ($info) = @_; push(@{$info->{seq}->{segs}}, $info->{seg}); push(@{$info->{seq}->{hits}}, @{$info->{seg}->{hits}}); $info->{seg} = undef; } sub _end_sequence { my ($info) = @_; my $seq = $info->{seq}; $info->{seq} = undef; my ($good_score, $worse_score) = @{$seq->{scores}}; if (defined $worse_score) { if (&compare_scores($good_score, $worse_score) > 0) { my $tmp = $good_score; $good_score = $worse_score; $worse_score = $tmp; } } # process any better scores that were postponed if (scalar @{$info->{postponed_scores}} > 0) { @{$info->{postponed_scores}} = sort { &compare_scores($a, $b) } @{$info->{postponed_scores}}; my $i; for ($i = 0; $i < scalar @{$info->{postponed_scores}}; $i++) { my $score = $info->{postponed_scores}->[$i]; if (&compare_scores($score, $good_score) <= 0) { my $other_seq = &retrieve_seq($info, $score->{file_loc}); &process_seq($info, $other_seq, $score); } else { last; } } # remove the used postponed scores splice(@{$info->{postponed_scores}}, 0, $i) } # output the best existing score &process_seq($info, $seq, $good_score); if (defined $worse_score) { # store the sequence for later retrieval $worse_score->{file_loc} = &store_seq($info, $seq); push(@{$info->{postponed_scores}}, $worse_score); } } sub _end_sequences { my ($info) = @_; if (scalar @{$info->{postponed_scores}} > 0) { @{$info->{postponed_scores}} = sort { &compare_scores($a, $b) } @{$info->{postponed_scores}}; for (my $i = 0; $i < scalar @{$info->{postponed_scores}}; $i++) { my $score = $info->{postponed_scores}->[$i]; my $seq = &retrieve_seq($info, $score->{file_loc}); &process_seq($info, $seq, $score); } @{$info->{postponed_scores}} = (); } } sub _handle_runtime { my ($info, $host, $when, $cycles, $seconds) = @_; $info->{host} = $host; $info->{runtime} = $seconds; } sub _end_mast { my ($info) = @_; } sub load_xml { my ($xml_path) = @_; my $data = { xnum => 1 }; my $sax = new MastSAX($data, start_mast => \&_start_mast, handle_command_line => \&_handle_command_line, handle_settings => \&_handle_settings, handle_alphabet => \&_handle_alphabet, handle_sequence_alphabet => \&_handle_sequence_alphabet, handle_translate => \&_handle_translate, handle_background => \&_handle_background, start_motif_dbs => \&_start_motif_dbs, handle_motif_db => \&_handle_motif_db, end_motif_dbs => \&_end_motif_dbs, start_motifs => \&_start_motifs, handle_motif => \&_handle_motif, end_motifs => \&_end_motifs, start_correlations => \&_start_correlations, handle_correlation => \&_handle_correlation, end_correlations => \&_end_correlations, start_nos => \&_start_nos, handle_expect => \&_handle_expect, end_nos => \&_end_nos, start_sequence_dbs => \&_start_sequence_dbs, handle_sequence_db => \&_handle_sequence_db, end_sequence_dbs => \&_end_sequence_dbs, start_sequences => \&_start_sequences, start_sequence => \&_start_sequence, handle_score => \&_handle_score, start_seg => \&_start_seg, handle_data => \&_handle_data, handle_hit => \&_handle_hit, end_seg => \&_end_seg, end_sequence => \&_end_sequence, end_sequences => \&_end_sequences, handle_runtime => \&_handle_runtime, end_mast => \&_end_mast ); my $fh; sysopen($fh, $xml_path, O_RDONLY) or confess("Failed to open file \"$xml_path\"\n"); while (<$fh>) { $sax->parse_more($_); if ($sax->has_errors()) { confess("Failed to write text output due to errors processing the XML:\n" . join("\n", $sax->get_errors())); } } $sax->parse_done(); if ($sax->has_errors()) { confess("Failed to write text output due to errors processing the XML:\n" . join("\n", $sax->get_errors())); } return $data; } sub write_text { my ($txt_path, $doc, $data) = @_; my $stars = ("*" x 80) . "\n"; my $alph = $data->{alph}; my $seq_alph = (defined($data->{seq_alph}) ? $data->{seq_alph} : $alph); my $seq_alph_name = ( $seq_alph->is_dna() ? "nucleotide" : ( $seq_alph->is_protein() ? "peptide" : $seq_alph->name())); my $motif_alph_name = ( $alph->is_dna() ? "nucleotide" : ( $alph->is_protein() ? "peptide" : $alph->name())); my $fh; sysopen($fh, $txt_path, O_WRONLY | O_CREAT | O_TRUNC) or confess("Failed to open file \"$txt_path\" for writing\n"); print $fh $stars, "MAST - Motif Alignment and Search Tool\n", $stars, "\tMAST version $data->{version} (Release date: $data->{release})\n\n", "\tFor further information on how to interpret these results or to get\n", "\ta copy of the MAST software please access http://meme-suite.org .\n", $stars, "\n\n", $stars, "REFERENCE\n", $stars, "\tIf you use this program in your research, please cite:\n\n", "\tTimothy L. Bailey and Michael Gribskov,\n", "\t\"Combining evidence using p-values: application to sequence homology\n", "\tsearches\", Bioinformatics, 14(48-54), 1998.\n", $stars, "\n\n", $stars, "DATABASE AND MOTIFS\n", $stars; # print info on sequence databases for (my $i = 0; $i < scalar(@{$data->{sequence_dbs}}); $i++) { my $seqdb = $data->{sequence_dbs}->[$i]; print $fh "\tDATABASE $seqdb->{name} ($seq_alph_name)\n", "\tLast updated on $seqdb->{timestamp}\n", "\tDatabase contains $seqdb->{nseq} sequences, $seqdb->{nres} residues\n\n"; } # print handling of strands # combine|separate|norc|unstranded if ($data->{strand_handling} eq 'combine') { print $fh "\tScores for positive and reverse complement strands are combined.\n\n"; } elsif ($data->{strand_handling} eq 'separate') { print $fh "\tPositive and reverse complement strands are scored separately.\n\n"; } elsif ($data->{strand_handling} eq 'norc') { print $fh "\tReverse complement strands are not scored.\n\n"; } # get the maximum widths for the ID and ALT_ID my $m_width = 5; my $id_width = 2; my $alt_id_width = 6; my $w_width = 5; my $bpm_width = 19; for (my $j = 0; $j < scalar(@{$data->{motifs}}); $j++) { my $motif = $data->{motifs}->[$j]; if (defined $motif->{id} && length($motif->{id}) > $id_width) { $id_width = length($motif->{id}); } if (defined $motif->{alt} && length($motif->{alt}) > $alt_id_width) { $alt_id_width = length($motif->{alt}); } } # print info on the motifs for (my $i = 0; $i < scalar(@{$data->{motif_dbs}}); $i++) { my $motifdb = $data->{motif_dbs}->[$i]; print $fh "\tMOTIFS $motifdb->{name} ($motif_alph_name)\n"; #print $fh "\tMOTIF WIDTH BEST POSSIBLE MATCH\n"; printf $fh "\t%-*s %-*s %-*s %-*s %-*s\n", $m_width, "MOTIF", $id_width, "ID", $alt_id_width, "ALT ID", $w_width, "WIDTH", $bpm_width, "BEST POSSIBLE MATCH"; printf $fh "\t%s %s %s %s %s\n", '-' x $m_width, '-' x $id_width, '-' x $alt_id_width, '-' x $w_width, '-' x $bpm_width; for (my $j = 0; $j < scalar(@{$data->{motifs}}); $j++) { my $motif = $data->{motifs}->[$j]; # skip over motifs not for this database next if $motif->{db} != $i; next if ($motif->{bad} && $data->{remove_correlated}); #printf $fh "\t%3d %3d %s\n", ($j + 1), $motif->{len}, $motif->{best_f}; printf $fh "\t%*d %-*s %-*s %*d %-s\n", $m_width, ($j + 1), $id_width, $motif->{id}, $alt_id_width, $motif->{alt}, $w_width, $motif->{len}, $motif->{best_f}; } print $fh "\n"; } # print order and spacing of motifs print $fh "\tNominal ordering and spacing of motifs:\n\t $data->{nos}\n\n" if (defined $data->{nos}); # print pairwise correlations as long as there are at least 2 motifs my $bad_motif_count = 0; for (my $i = 0; $i < scalar(@{$data->{motifs}}); $i++) { $bad_motif_count++ if $data->{motifs}->[$i]->{bad}; } if ((!$data->{remove_correlated} && scalar(@{$data->{motifs}}) >= 2) || (scalar(@{$data->{motifs}}) - $bad_motif_count >= 2)) { print $fh "\tPAIRWISE MOTIF CORRELATIONS:\n", "\tMOTIF"; for (my $i = 0; $i < scalar(@{$data->{motifs}}) - 1; $i++) { my $motif = $data->{motifs}->[$i]; next if ($data->{remove_correlated} && $motif->{bad}); printf $fh " %5d", ($i + 1); } print $fh "\n\t-----"; for (my $i = 0; $i < scalar(@{$data->{motifs}}) - 1; $i++) { my $motif = $data->{motifs}->[$i]; next if ($data->{remove_correlated} && $motif->{bad}); printf $fh " -----"; } print $fh "\n"; for (my $i = 1; $i < scalar(@{$data->{motifs}}); $i++) { my $from = $data->{motifs}->[$i]; next if ($data->{remove_correlated} && $from->{bad}); printf $fh "\t %2d ", ($i + 1); for (my $j = 0; $j < $i; $j++) { my $to = $data->{motifs}->[$j]; next if ($data->{remove_correlated} && $to->{bad}); printf $fh " %5.2f", $data->{correlation}->{$i}->{$j}; } print $fh "\n"; } if ($bad_motif_count) { printf $fh "\tCorrelations above %4.2f may cause some combined p-values and\n" . "\tE-values to be underestimates.\n", $data->{max_correlation}; if ($data->{remove_correlated}) { print $fh "\tRemoved motif"; } else { print $fh "\tRemoving motif"; } print $fh ($bad_motif_count > 1 ? "s " : " "); for (my $i = 0, my $j = 0; $i < scalar(@{$data->{motifs}}); $i++) { my $motif = $data->{motifs}->[$i]; next unless $motif->{bad}; if (($j + 1) == $bad_motif_count) { print $fh " and " if $j > 0; } else { print $fh ", " if $j > 0; } print $fh ($i + 1); $j++; } if ($data->{remove_correlated}) { printf $fh " because they have correlation > %4.2f\n\twith the remaining motifs.\n\n", $data->{max_correlation}; } else { print $fh " from the query may be advisable.\n\n"; } } else { printf $fh "\tNo overly similar pairs (correlation > %4.2f) found.\n\n", $data->{max_correlation}; } } # print background frequencies if ($data->{bg_source} ne 'sequence') { printf $fh "\tRandom model letter frequencies (from %s):", $data->{bg_source} eq "--nrdb--" ? "non-redundant database" : $data->{bg_source}; for (my $i = 0, my $pcol = 80; $i < $alph->size_core(); $i++) { $pcol += 8; if ($pcol >= 80) { $pcol = 15; print $fh "\n\t"; } printf $fh "%s %5.3f ", $alph->char($i), $data->{bg_freqs}->[$i]; } print $fh "\n"; } else { print $fh "\tUsing random model based on each target sequence composition.\n"; } # end database and motif section print $fh $stars; # print table of hits documentation print $fh "\n\n", $stars, "SECTION I: HIGH-SCORING SEQUENCES\n", $stars; if ($doc) { printf $fh "\t- Each of the following %d sequences has E-value less than %g.\n", $data->{matching_nseqs}, $data->{max_seq_evalue}; print $fh "\t- The E-value of a sequence is the expected number of sequences\n", "\t in a random database of the same size that would match the motifs as\n", "\t well as the sequence does and is equal to the combined p-value of the\n", "\t sequence times the number of sequences in the database.\n", "\t- The combined p-value of a sequence measures the strength of the\n", "\t match of the sequence to all the motifs and is calculated by\n", "\t o finding the score of the single best match of each motif\n", "\t to the sequence (best matches may overlap),\n", "\t o calculating the sequence p-value of each score,\n", "\t o forming the product of the p-values,\n"; if (defined $data->{nos}) { print $fh "\t o multiplying by the p-value of the observed spacing of\n", "\t pairs of adjacent motifs (given the nominal spacing),\n"; } my $fs1 = ""; my $fs2 = ""; if (defined $data->{seq_alph}) { if ($data->{strand_handling} eq 'separate') { $fs1 = "\t- The strand and frame of the (best) motif match(es) is shown.\n"; } else { $fs1 = "\t- The frame of the (best) motif match(es) is shown.\n"; } $fs2 = "\t Frames 1, 2, and 3 are labeled a, b c, respectively.\n" } print $fh "\t o taking the p-value of the product.\n", "\t- The sequence p-value of a score is defined as the\n", "\t probability of a random sequence of the same length containing\n", "\t some match with as good or better a score.\n", "\t- The score for the match of a position in a sequence to a motif\n", "\t is computed by by summing the appropriate entry from each column of\n", "\t the position-dependent scoring matrix that represents the motif.\n", "\t- Sequences shorter than one or more of the motifs are skipped.\n", $fs1, $fs2, "\t- The table is sorted by increasing E-value.\n", $stars, "\n"; } # print table of hits headings and data { my $x = defined $data->{seq_alph}; my $s = $data->{strand_handling} eq 'separate'; my $desc_len = $COMMENT_WIDTH - ($x ? ($FRAME_WIDTH + 1) : ($s ? ($STRAND_WIDTH + 1) : 0)); my $nf = '%-' . $NAME_WIDTH . 's'; my $df = '%-' . $desc_len . 's'; print $fh sprintf($nf,"SEQUENCE NAME")," ",sprintf($df,"DESCRIPTION")," ",($x?"FRAME ":($s?"STRAND ":"")),"E-VALUE LENGTH\n"; print $fh sprintf($nf,"-------------")," ",sprintf($df,"-----------")," ",($x?"----- ":($s?"------ ":"")),"-------- ------\n"; if (defined $data->{hit_file}) { my $file = $data->{hit_file}; seek($file, 0, SEEK_SET); while (<$file>) { print $fh $_; } } print $fh "\n", $stars, "\n"; } # print table of diagrams documentation print $fh "\n\n", $stars, "SECTION II: MOTIF DIAGRAMS\n", $stars; if ($doc) { my $ptype = $data->{adj_hit_pvalue} ? "SEQUENCE" : "POSITION"; my $hit_pvalue = sprintf("%g", $data->{max_hit_pvalue}); my $weak_pvalue = sprintf("%g", $data->{max_weak_pvalue}); my $issep = $data->{strand_handling} eq 'separate'; my $s = $data->{strand_handling} eq 'combine' ? 's' : ''; my $f = defined $data->{seq_alph} ? 'f' : ''; print $fh "\t- The ordering and spacing of all non-overlapping motif occurrences\n", "\t are shown for each high-scoring sequence listed in Section I.\n", "\t- A motif occurrence is defined as a position in the sequence whose\n", "\t match to the motif has $ptype p-value less than $weak_pvalue.\n", "\t- The $ptype p-value of a match is the probability of\n", ($data->{adj_hit_pvalue} ? ("\t some random subsequence in a set of n,\n", "\t where n is the sequence length minus the motif width plus 1,\n") : ("\t a single random subsequence of the length of the motif\n") ), "\t scoring at least as well as the observed match.\n", "\t- For each sequence, all motif occurrences are shown unless there\n", "\t are overlaps. In that case, a motif occurrence is shown only if its\n", "\t p-value is less than the product of the p-values of the other\n", "\t (lower-numbered) motif occurrences that it overlaps.\n", "\t- The table also shows ", ($issep ? ("the strand and\n\t ") : ()), "the E-value of each sequence.\n", "\t- Spacers and motif occurences are indicated by\n", "\t o -d- `d' residues separate the end of the preceding motif \n", "\t\t occurrence and the start of the following motif occurrence\n", "\t o [${s}n$f] occurrence of motif `n' with p-value less than $hit_pvalue", (defined $data->{seq_alph} ? ("\n\t\t in frame f. Frames 1, 2, and 3 are labeled a, b c.\n") : (".\n") ), ($data->{strand_handling} eq 'combine' ? ("\t\t A minus sign indicates that the occurrence is on the\n", "\t\t reverse complement strand.\n") : () ), ($hit_pvalue < $weak_pvalue ? ( "\t o <${s}n$f> occurrence of motif `n' with $hit_pvalue < p-value < $weak_pvalue", (defined $data->{seq_alph} ? ("\n\t\t in frame f. Frames 1, 2, and 3 are labeled a, b c.\n") : (".\n") ), ($data->{strand_handling} eq 'combine' ? ("\t\t A minus sign indicates that the occurrence is on the\n", "\t\t reverse complement strand.\n") : () ) ) : ()), $stars, "\n"; } # print table of diagrams headings and data { my $nf = "%-". $NAME_WIDTH . "s"; my $s = $data->{strand_handling} eq 'separate'; print $fh sprintf($nf,"SEQUENCE NAME"), ($s?" STRAND":""), " E-VALUE MOTIF DIAGRAM\n"; print $fh sprintf($nf,"-------------"), ($s?" ------":""), " -------- -------------\n"; if (defined $data->{diag_file}) { my $file = $data->{diag_file}; seek($file, 0, SEEK_SET); while (<$file>) { print $fh $_; } } print $fh "\n", $stars, "\n"; } # print table of annotated sequences documentation and data print $fh "\n\n", $stars, "SECTION III: ANNOTATED SEQUENCES\n", $stars; if ($doc) { my $ptype = $data->{adj_hit_pvalue} ? "SEQUENCE" : "POSITION"; my $hit_pvalue = sprintf("%g", $data->{max_hit_pvalue}); my $weak_pvalue = sprintf("%g", $data->{max_weak_pvalue}); my $t = defined $data->{seq_alph}; my $c = $data->{strand_handling} eq 'combine'; print $fh "\t- The positions and p-values of the non-overlapping motif occurrences\n", "\t are shown above the actual sequence for each of the high-scoring\n", "\t sequences from Section I.\n", "\t- A motif occurrence is defined as a position in the sequence whose\n", "\t match to the motif has $ptype p-value less than $weak_pvalue as \n", "\t defined in Section II.\n", "\t- For each sequence, the first line specifies the name of the sequence.\n", "\t- The second (and possibly more) lines give a description of the \n", "\t sequence.\n", "\t- Following the description line(s) is a line giving the length, \n", "\t combined p-value, and E-value of the sequence as defined in Section I.\n", "\t- The next line reproduces the motif diagram from Section II.\n", "\t- The entire sequence is printed on the following lines.\n", "\t- Motif occurrences are indicated directly above their positions in the\n", "\t sequence on lines showing\n", "\t o the motif number", ($t ? ("/frame") : ()), " of the occurrence", ($c ? (" (a minus sign indicates that\n", "\t the occurrence is on the reverse complement strand)") : ()),",\n", "\t o the position p-value of the occurrence,\n", "\t o the best possible match to the motif", ($t && $c ? (" (or its reverse)") : ($c ? (" (or its reverse complement)") : ())), ($t ? (",") : (", and")), "\n", "\t o columns whose match to the motif has a positive score (indicated \n", "\t by a plus sign)", ($t && $c ? (", and\n", "\t o the protein translation of the match (or its reverse).\n") : ($t ? (", and\n", "\t o the protein translation of the match.\n") : (".\n"))), $stars; } if (defined $data->{note_file}) { my $file = $data->{note_file}; seek($file, 0, SEEK_SET); while (<$file>) { print $fh $_; } } print $fh "\n", $stars, "\n\n", "CPU: ", $data->{host}, "\n", "Time ", sprintf("%s", $data->{runtime}), " secs.\n\n", join(" ", @{$data->{args}}), "\n"; close($fh); } sub compare_scores { my ($score1, $score2) = @_; if ($score1->{evalue} < $score2->{evalue}) { return -1; } elsif ($score1->{evalue} > $score2->{evalue}) { return 1; } elsif ($score1->{pvalue} < $score2->{pvalue}) { return -1; } elsif ($score1->{pvalue} > $score2->{pvalue}) { return 1; } elsif ($score1->{strand} > $score2->{strand}) { return -1; } elsif ($score1->{strand} < $score2->{strand}) { return 1; } elsif (defined($score1->{frame}) && defined($score2->{frame})) { if ($score1->{frame} < $score2->{frame}) { return -1; } elsif ($score1->{frame} > $score2->{frame}) { return 1; } else { return 0; } } elsif (defined($score1->{frame})) { return -1; } elsif (defined($score2->{frame})) { return 1; } else { return 0; } } # # Internal Only Function # # systell( # $file_handle_ref # ); # # Given a file returns the current position in the file. # # Since I'm using syswrite then I must use sysseek # to get the file position instead of tell. # # Returns: # 1) The current file position or undefined on failure. # sub systell { return (sysseek($_[0], 0, SEEK_CUR) or confess("sysseek (for systell implementation) failed!")); } # # Internal Only Function # # syswrite_int( # $file_handle_ref, # $int # ); # # Given a file and a int writes the int to the file. # # As I couldn't figure out the docs on write I used syswrite. # # Returns nothing, though I should change this to check for error states. # sub syswrite_int { confess("Missing arguments") unless defined $_[0] && defined $_[1]; confess("Not an int!") unless (int($_[1]) == $_[1]); syswrite($_[0], pack('l', $_[1])) or confess("syswrite failed!"); } # # Internal Only Function # # syswrite_dbl( # $file_handle_ref, # $double # ); # # Given a file and a double writes the double to the file. # # As I couldn't figure out the docs on write I used syswrite. # # Returns nothing, though I should change this to check for error states. # sub syswrite_dbl { confess("Missing arguments") unless defined $_[0] && defined $_[1]; syswrite($_[0], pack('d', $_[1])) or confess("syswrite failed!"); } # # Internal Only Function # # syswrite_string( # $file_handle_ref, # $string # ); # # Given a file and a string writes the string to the file. # # As I couldn't figure out the docs on write I used syswrite. # This converts the string to utf 8 and writes out its length # in bigendian 32-bit int followed by the utf-8 octlets. # # Returns nothing, though I should change this to check for error states. # sub syswrite_string { confess("Missing arguments") unless defined $_[0] && defined $_[1]; my $octlets = encode_utf8($_[1]); my $data = pack('la*', length($octlets), $octlets); syswrite($_[0], $data) or confess("syswrite failed!"); } # # Internal Only Function # # read_dbl ( # $file_handle_ref # ); # # Given a file, read a double. # # Returns: # 1) the double # sub read_dbl { my $data; my $read_len = sysread($_[0], $data, $DBL_SIZE); confess("read failed!") if ($read_len != $DBL_SIZE); my $num = unpack('d', $data); return $num; } # # Internal Only Function # # read_int ( # $file_handle_ref # ); # # Given a file, read a integer. # # Returns: # 1) the integer # sub read_int { my $data; my $read_len = sysread($_[0], $data, $INT_SIZE); confess("Read int failed! Read $read_len bytes but expected $INT_SIZE bytes.") if ($read_len != $INT_SIZE); my $num = unpack('l', $data); return $num; } # # Internal Only Function # # read_string( # $file_handle_ref # ); # # Given a file, read a string. # # Reads a string written using syswrite_string. # A string consists of: # # # Returns: # 1) The string. # sub read_string { my $len = read_int(@_); if ($len > 0) { my $octlets; my $read_len = sysread($_[0], $octlets, $len, 0); confess("Read string failed! Read $read_len bytes but exptected $len bytes.") if ($read_len != $len); my $str = decode_utf8($octlets); return $str; } return ''; } sub store_seq { my ($info, $seq) = @_; unless (defined $info->{store}) { $info->{store} = tempfile(); binmode $info->{store}; } my $fh = $info->{store}; my $file_pos = sysseek($fh, 0, SEEK_END); confess("store_seq failure: sysseek to end of file failed") unless $file_pos; &syswrite_int($fh, $seq->{db}); &syswrite_string($fh, $seq->{name}); &syswrite_string($fh, $seq->{comment}); &syswrite_int($fh, $seq->{length}); &syswrite_int($fh, scalar @{$seq->{scores}}); for (my $i = 0; $i < scalar @{$seq->{scores}}; $i++) { my $score = $seq->{scores}->[$i]; &syswrite_int($fh, $score->{strand}); &syswrite_dbl($fh, $score->{pvalue}); &syswrite_dbl($fh, $score->{evalue}); &syswrite_int($fh, (defined $score->{frame} ? $score->{frame} : -1)); } &syswrite_int($fh, scalar @{$seq->{segs}}); for (my $i = 0; $i < scalar @{$seq->{segs}}; $i++) { my $seg = $seq->{segs}->[$i]; &syswrite_int($fh, $seg->{start_pos}); &syswrite_string($fh, $seg->{sequence}); &syswrite_int($fh, scalar @{$seg->{hits}}); for (my $j = 0; $j < scalar(@{$seg->{hits}}); $j++) { my $hit = $seg->{hits}->[$j]; &syswrite_int($fh, $hit->{pos}); &syswrite_int($fh, $hit->{motif}); &syswrite_int($fh, $hit->{rc}); &syswrite_dbl($fh, $hit->{pvalue}); &syswrite_string($fh, $hit->{match}); &syswrite_string($fh, (defined $hit->{translation} ? $hit->{translation} : '')); } } return $file_pos; } sub retrieve_seq { my ($info, $file_pos) = @_; confess("No file position specified!") unless defined $file_pos; confess("No storage file initilised!") unless defined $info->{store}; my $fh = $info->{store}; sysseek($fh, $file_pos, SEEK_SET) or confess("retrieve_seq failure: sysseek to file offset $file_pos failed."); my $seq = { db => &read_int($fh), name => &read_string($fh), comment => &read_string($fh), length => &read_int($fh), scores => [], segs => [], hits => [] }; my $score_count = &read_int($fh); for (my $i = 0; $i < $score_count; $i++) { my $score = { strand => &read_int($fh), pvalue => &read_dbl($fh), evalue => &read_dbl($fh), frame => &read_int($fh) }; $score->{frame} = undef if ($score->{frame} == -1); push(@{$seq->{scores}}, $score); } my $seg_count = &read_int($fh); for (my $i = 0; $i < $seg_count; $i++) { my $seg = { start_pos => &read_int($fh), sequence => &read_string($fh), hits => [] }; my $hit_count = &read_int($fh); for (my $j = 0; $j < $hit_count; $j++) { my $hit = { pos => &read_int($fh), motif => &read_int($fh), rc => &read_int($fh), pvalue => &read_dbl($fh), match => &read_string($fh), translation => &read_string($fh) }; $hit->{translation} = undef if ($hit->{translation} eq ''); push(@{$seg->{hits}}, $hit); } push(@{$seq->{segs}}, $seg); push(@{$seq->{hits}}, @{$seg->{hits}}); } return $seq; } sub make_block { my ($info, $hit, $print_pvalue) = @_; my $pv = $hit->{pvalue}; my $ok_pv = $info->{max_hit_pvalue}; my $pvstr = ($print_pvalue ? sprintf("(%8.2e)", $pv) : ''); my $lbracket = $pv < $ok_pv ? '[' : '<'; my $rbracket = $pv < $ok_pv ? ']' : '>'; my $strand = ($info->{strand_handling} eq 'combine' ? ($hit->{rc} ? '-' : '+') : ''); my $motif = $hit->{motif} + 1; my $frame = (defined $info->{seq_alph} ? chr(ord('a') + ($hit->{pos} % $info->{xnum})) : ''); return $lbracket . $strand . $motif . $frame . $pvstr . $rbracket; } sub make_diagram { my ($info, $seq, $score) = @_; my $out = ''; my $dash = "-"; my $prev_hit = undef; for (my $i = 0; $i < scalar @{$seq->{hits}}; $i++) { my $hit = $seq->{hits}->[$i]; next if ($score->{strand} != 0 && $hit->{rc} == ($score->{strand} == 1)); if (defined $prev_hit) { my $ws = $info->{motifs}->[$prev_hit->{motif}]->{len} * $info->{xnum}; my $gap = $hit->{pos} - ($prev_hit->{pos} + $ws); if ($gap > 0) { $out .= $dash . $gap . $dash; } else { $out .= $dash; } } else { # first hit my $gap = $hit->{pos}; $out .= $gap . $dash if $gap > 0; } $out .= &make_block($info, $hit, 0); $prev_hit = $hit; } if (defined $prev_hit) { my $ws = $info->{motifs}->[$prev_hit->{motif}]->{len} * $info->{xnum}; my $gap = $seq->{length} - ($prev_hit->{pos} + $ws); $out .= $dash . $gap if $gap > 0; } else { my $gap = $seq->{length}; $out .= $seq->{length}; } } sub print_diagram { my ($fh, $diagram, $header) = @_; my $diagram_len = length($diagram); my $header_len = length($header); for (my $i = 0; $i < $diagram_len;) { print $fh ($i == 0 ? $header : ' ' x $header_len); my $remain = $diagram_len - $i; my $print_len = $PAGE_WIDTH - $header_len - 6; if ($remain <= $PAGE_WIDTH - $header_len) { $print_len = $remain; } else { for (; $print_len < $remain; $print_len++) { my $c = substr($diagram, $i + $print_len, 1); last if ($c eq '_' || $c eq ','); } } print $fh substr($diagram, $i, $print_len), "\n"; $i += $print_len; } } sub space_seq { my ($info, $best, $joiner) = @_; return undef unless defined $best; return $best unless $info->{xnum} > 1; my @parts = split('', $best); push(@parts, ''); return join($joiner x ($info->{xnum} - 1), @parts); } sub rtrim { my ($str) = @_; $str =~ s/\s+$//; return $str; } sub process_seq { my ($info, $seq, $score) = @_; my $kmotifs = 0; # disabled my $x = defined $info->{seq_alph}; my $s = $info->{strand_handling} eq 'separate'; my $diagram = make_diagram($info, $seq, $score); my $strand = ($s ? ($score->{strand} == -1 ? "-" : "+") : ""); my $frame = ($x ? chr(ord('a') + $score->{frame}) : ""); my $strame = $strand . $frame; my $strame_len = ($x ? $FRAME_WIDTH + 1 : ($s ? $STRAND_WIDTH + 1 : 0)); # print the hit in the hits section. { $info->{hit_file} = tempfile() unless (defined $info->{hit_file}); my $fh = $info->{hit_file}; my $desc_len = $COMMENT_WIDTH - $strame_len; my $elipsis = ''; if (length($seq->{comment}) > $desc_len) { $elipsis = "..."; $desc_len -= 3; } printf $fh "%-*.*s %-*.*s%s%*s %8.2g %6d\n", $NAME_WIDTH, $NAME_WIDTH, $seq->{name}, $desc_len, $desc_len, $seq->{comment}, $elipsis, $strame_len, $strame, $score->{evalue}, $seq->{length}; } # print the motif diagram in the diagrams section { $info->{diag_file} = tempfile() unless (defined $info->{diag_file}); my $fh = $info->{diag_file}; my $header = sprintf "%-*.*s%*s %8.2g ", $NAME_WIDTH, $NAME_WIDTH, $seq->{name}, ($s? $STRAND_WIDTH + 1 : 0), $strand, $score->{evalue}; &print_diagram($fh, $diagram, $header); } # print the note file { $info->{note_file} = tempfile() unless (defined $info->{note_file}); my $fh = $info->{note_file}; my $strand2 = ($s ? ($score->{strand} == -1 ? " (- strand)" : " (+ strand)") : ""); print $fh "\n\n", $seq->{name}, $strand2, "\n"; my $comment_lines = 0; my $comment_len = length($seq->{comment}); my $offset = 0; my $good_break = 0; for (my $offset = 0, my $good_break = 0; $offset < $comment_len; $offset += $good_break + 1, $good_break = 0) { for (my $i = 0; $i < $PAGE_WIDTH - 2; $i++) { my $pos = $offset + $i; $good_break = $i if ($pos >= $comment_len || substr($seq->{comment}, $pos, 1) eq ' '); last if $pos >= $comment_len; } $good_break = $PAGE_WIDTH - 3 if ($good_break == 0); print $fh ' ', substr($seq->{comment}, $offset, $good_break + 1), "\n"; $comment_lines++; if ($comment_lines >= $MAX_COMMENT_LINES) { print $fh " ...\n"; last; } } # print th elength, combined p-value and E-value printf $fh " LENGTH = %d COMBINED P-VALUE = %8.2e E-VALUE = %8.2g\n", $seq->{length}, $score->{pvalue}, $score->{evalue}; # print the motif diagram print_diagram($fh, $diagram, " DIAGRAM: "); my ($num_ln, $pv_ln, $best_ln, $match_ln, $tr_ln) = ('','','','',''); my $cols = $PAGE_WIDTH - 5; my $spacer = ' ' x 5; for (my $i = 0; $i < scalar(@{$seq->{segs}}); $i++) { my $seg = $seq->{segs}->[$i]; my $start = $seg->{start_pos}; my $end = $seg->{start_pos} + length($seg->{sequence}); for (my $pos = $start, my $hit_i = 0; $pos < $end; $pos += $cols) { for (;$hit_i < scalar @{$seg->{hits}}; $hit_i++) { my $hit = $seg->{hits}->[$hit_i]; last if ($hit->{pos} >= ($pos + $cols)); next if ($score->{strand} != 0 && $hit->{rc} == ($score->{strand} == 1)); my $motif = $info->{motifs}->[$hit->{motif}]; $num_ln .= ' ' x (($hit->{pos} - $pos) - length($num_ln)) if (length($num_ln) < ($hit->{pos} - $pos)); $num_ln .= &make_block($info, $hit); $pv_ln .= ' ' x (($hit->{pos} - $pos) - length($pv_ln)) if (length($pv_ln) < ($hit->{pos} - $pos)); $pv_ln .= sprintf("%.1e", $hit->{pvalue}); $best_ln .= ' ' x (($hit->{pos} - $pos) - length($best_ln)) if (length($best_ln) < ($hit->{pos} - $pos)); $best_ln .= &space_seq($info, $hit->{rc} ? $motif->{best_r} : $motif->{best_f}, '.'); $match_ln .= ' ' x (($hit->{pos} - $pos) - length($match_ln)) if (length($match_ln) < ($hit->{pos} - $pos)); $match_ln .= &space_seq($info, $hit->{match}, ' '); if (defined $info->{seq_alph}) { $tr_ln .= ' ' x (($hit->{pos} - $pos) - length($tr_ln)) if (length($tr_ln) < ($hit->{pos} - $pos)); $tr_ln .= &space_seq($info, $hit->{translation}, '.'); } } # print only if there is something on the displayed strand if (length $best_ln > 0) { my $posstr = sprintf("%-5d", ($pos + 1)); print $fh "\n", $spacer, (length $num_ln > $cols ? substr($num_ln, 0, $cols) : $num_ln), "\n", $spacer, (length $pv_ln > $cols ? substr($pv_ln, 0, $cols) : $pv_ln), "\n", $spacer, (length $best_ln > $cols ? substr($best_ln, 0, $cols) : $best_ln), "\n", $spacer, rtrim(length $match_ln > $cols ? substr($match_ln, 0, $cols) : $match_ln), "\n", (defined $info->{seq_alph} ? ($spacer, (length $tr_ln > $cols ? substr($tr_ln, 0, $cols) : $tr_ln), "\n") : ()), $posstr, substr($seg->{sequence}, $pos - $start, $cols), "\n"; } # advance past the printed portion $num_ln = (length $num_ln > $cols ? substr($num_ln, $cols) : ''); $pv_ln = (length $pv_ln > $cols ? substr($pv_ln, $cols) : ''); $best_ln = (length $best_ln > $cols ? substr($best_ln, $cols) : ''); $match_ln = (length $match_ln > $cols ? substr($match_ln, $cols) : ''); $tr_ln = (length $tr_ln > $cols ? substr($tr_ln, $cols) : ''); } } } } sub cleanup { my ($data) = @_; close($data->{hit_file}) if defined $data->{hit_file}; close($data->{diag_file}) if defined $data->{diag_file}; close($data->{note_file}) if defined $data->{note_file}; close($data->{store}) if defined $data->{store}; } sub main { &initialise(); my $opts = &arguments(); # load MAST XML my $data = &load_xml($opts->{XML_PATH}); # write MAST text &write_text($opts->{TXT_PATH}, $opts->{DOC}, $data); # close temporary files &cleanup($data); } &main(); 1;