#!/usr/bin/perl -w # AUTHOR: Philip Machanick # CREATE DATE: 20 July 2009 # Convert a Hartemink PSP file into MEME PSP format # Hartemink format: # For each FASTA sequence s[1..L]: Hartemink prior file contains a name line based on # the name of the FASTA sequence followed by p[1..2L], where p[1..L] are positive strand # priors and p[L+1..Ln] are negative strand priors (complement of s[L..1] = reverse # complement of s[1..L]); in both directions last w-1 positions are 0 for priors # calculated with motif width w; the prior values are scores p[i,j] representing the # probability of a motif starting at location j in sequence i. # MEME format: # Assumes reverse-stranded case is symmetric so only contains priors for one direction. # For each FASTA sequence s[1..L]: MEME prior file contains p[1..L] for each sequence # in a FASTA-like format in which the name line also includes the width w and the epsilon # value used to scale score values > 0 and < 1. The values are a probability distribution # rather than scores, that sum to 1 for an oops model, and < 1 per sequence for a zoops # model. # FIXME: not yet implemented for TCM, for which the probabilities should sum to 1 over # all sequences. # reads stdin # writes stdout use strict; use Scalar::Util qw(looks_like_number); my $PGM = $0; # name of program $PGM =~ s#.*/##; # remove part up to last slash #@args = @ARGV; # arguments to program my @args = ("-width W"); $| = 1; # flush after all prints $SIG{'INT'} = \&cleanup; # interrupt handler # Note: so that interrupts work, always use for system calls: # if ($status = system("$command")) {&cleanup($status)} push(@INC, split(":", $ENV{'PATH'})); # look in entire path my $MIN_EPSILON = 1E-200; my $MAX_EPSILON = 0.1; my $DEFAULT_MOD = 'zoops'; my $usage = < 'zoops', 'oops' => 'oops', 'tcm' => 'tcm'); my $revcomp; my $model = 'zoops'; # get input arguments while ($#ARGV >= 0) { $_ = shift; if ($_ eq "-h") { # help &print_usage("$usage", 0); } elsif ($_ eq "-width") { $width = shift; &print_usage("$usage", 1) unless (&check_numeric($width, "int", 2)); } elsif ($_ eq "-epsilon") { $epsilon = shift; &print_usage("$usage", 1) unless (&check_numeric($epsilon, "float", $MIN_EPSILON, $MAX_EPSILON)); } elsif ($_ eq "-mod") { $model = shift; &print_usage("$usage", 1) unless (exists $models{$model}); die "tcm not implemented" if ($model eq 'tcm'); } elsif ($_ eq "-revcomp") { $revcomp = 1; } else { &print_usage("$usage", 1); } } # check if compulsory args are set, and set any missing optional args &print_usage("$usage", 1) unless (defined($width)); $model = $DEFAULT_MOD unless (defined($model)); $epsilon = $MIN_EPSILON unless (defined($epsilon)); # read in the prior data my %raw_seqs = &read_prior_file(\*STDIN); # convert reverse-complemented scores to single-stranded scores # must keep the scores in range (0..1) for later calculations if (defined($revcomp)) { # reduce the data to half the original size, making each element # the mean of the original location + the same location in the reverse comp direction foreach my $name (keys %raw_seqs) { my $data = $raw_seqs{$name}; my $L = @$data; # skip w-1 zeros at end though they are unchanged when averaged for (my $i = 0; $i < $L/2-$width+1; $i++) { # position in reverse strand takes into account w-1 zeros $$data[$i] += $$data[$L-$i-$width]; $$data[$i] /= 2; } # reset the length of the array to half the previous length $#$data = @$data/2-1; } } # now adjust from [0..1] to [epsilon..1-epsilon] foreach my $name (keys %raw_seqs) { my $data = $raw_seqs{$name}; for (my $i = 0; $i < @$data; $i++) { die "in `$name' bad data at [$i]: $$data[$i]" unless ($$data[$i] >= 0 && $$data[$i] <= 1); $$data[$i] = $epsilon + $$data[$i] * (1 - 2.0 * $epsilon); } } # now convert the raw scores to priors foreach my $name (keys %raw_seqs) { my $alpha = $model eq 'zoops' ? 1 : 0; my @K = (); print '>'.$name." $width epsilon = "; printf "%lG\n", $epsilon; my $nonzeros = @{$raw_seqs{$name}} - $width + 1; for (my $j = 0; $j < $nonzeros; $j++) { my $P_ij = ${$raw_seqs{$name}}[$j]; push(@K,$P_ij/(1-$P_ij)); $alpha += $K[-1]; # add on last K } foreach my $k_val (@K) { printf "%20.19lG ", $k_val / $alpha; } # now print terminating w-1 zeros for (my $i = 0; $i < $width - 2; $i++) { print "0 "; } print "0\n"; } ################################################################################ # Subroutines # ################################################################################ ################################################################################ # read_prior_file # # Read a PRIOR file and return the values in an associative array indexed # by sequence name (based on read_fasta_file). # Does NOT check validity of data to allow use for a range of prior types. # Only assumes a FASTA-like name line starting with >name followed by a # sequence of values that may contain spaces (unlike read_fasta_file, which # edits out spaces). # # $file open file pointer # ################################################################################ sub read_prior_file { my ($file) = @_; my ($name, $seq) = ("",""); my %seqs; # read the file while (<$file>) { if (/^>(\S+)\s/) { # found start of sequence if ($name) { # save previous sequence in array $_ = $seq; $seqs{$name} = [split]; } $name = $1; # save sequence name $seq = ""; } else { $seq .= $_; } } # read file if ($name && $seq) { # save very last sequence in array $_ = $seq; $seqs{$name} = [split]; } %seqs; # return associative array of names and arrays of values } # read_prior_file ################################################################################ # # check_numeric # # check if value is defined and numeric; type, max and min optional # if supplied type "int" signals check if truncated == original; any # other value for type means don't do this check # ################################################################################ sub check_numeric { my ($data, $type, $min, $max) = @_; if (defined($data)) { if (!looks_like_number($data)) { return 0; } if (defined($max) && $data > $max) { return 0; } if (defined($min) && $data < $min) { return 0; } if (defined($type) && $type eq "int") { return int($data) == $data; } # passed all tests here: return 1; } else { return 0; } } ################################################################################ # # print_usage # # Print the usage message and exit. # ################################################################################ sub print_usage { my($usage, $status) = @_; if (-c STDOUT) { # standard output is a terminal open(C, "| more"); print C $usage; close C; } else { # standard output not a terminal print STDERR $usage; } exit $status; } ################################################################################ # cleanup # # cleanup stuff # ################################################################################ sub cleanup { my($status, $msg) = @_; if ($status && "$msg") {print STDERR "$msg: $status\n";} exit($status); }