#!/usr/bin/perl -w # FILE: rna2meme # AUTHOR: James Johnson # CREATE DATE: 19/8/2010 # DESCRIPTION: Convert a RNA sequence to its binding motif in MEME format use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna rna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Fcntl qw(O_RDONLY); use Getopt::Long; use List::Util qw(max min); use Pod::Usage; =head1 NAME rna2meme - takes a FASTA file with short RNA sequences and creates MEME motifs. =head1 SYNOPSIS rna2meme [options] Options: -rna Output RNA motifs instead of DNA motifs -seed_start starting offset of seed in RNA sequence, set to 0 to treat entire sequence as seed; default: 0 -seed_end ending offset of seed in RNA sequence; default: 0 -start starting offset in RNA sequence (inclusive); use negative numbers to count from end; default: 1 -end ending offset in RNA sequence (inclusive); use negative numbers to count from end; default: -1 -match count to assign to a match (complement) in the seed region default: 1 -wobble count to assign to a U for a G, or a G for a U in the seed region default: 0.1 -miss count to assign to a non-match non-wobble in the seed region default: 0.01 -other_count extra count added to match, wobble and misses in non-seed positions to reduce their contribution to the score; default: 0.5 -bg file with background frequencies of letters; default: uniform background -pseudo add times letter background to each frequency; default: 0 -logodds output the log-odds (PSSM) and frequency (PSPM) motifs; default: PSPM motif only -url website for the motif; The FASTA ID is substituted for MOTIF_NAME; The first word after the FASTA ID is substituted for MOTIF_AC; default: no url Convert an RNA sequence to its binding motif in MEME format. Writes standard output. =cut # Constants my $sites = 20; # Set option defaults my $is_rna = 0; my $start = 1; my $end = -1; my $seed_start = 0; my $seed_end = 0; my $match = 1; my $wobble = 0.1; my $miss = 0.01; my $other_count = 0.5; my $bg_file; my $pseudo_total = 0; my $print_logodds = 0; my $url_pattern = ""; my $fasta_file; GetOptions( "rna" => \$is_rna, "seed_start=i" => \$seed_start, "seed_end=i" => \$seed_end, "other_count=f" => \$other_count, "start=i" => \$start, "end=i" => \$end, "match=f" => \$match, "wobble=f" => \$wobble, "miss=f" => \$miss, "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, "url=s" => \$url_pattern) or pod2usage(2); pod2usage("Option -seed_end may not be smaller than -seed_start.") if ($seed_end < $seed_start); pod2usage("Option -other_count must have a non-negative value.") if ($other_count < 0); pod2usage("Option -start must not be zero.") if ($start == 0); pod2usage("Option -end must not be zero.") if ($end == 0); pod2usage("Option -match must have a non-negative value.") if ($match < 0); pod2usage("Option -wobble must have a non-negative value.") if ($wobble < 0); pod2usage("Option -miss must have a non-negative value.") if ($miss < 0); pod2usage("Option -pseudo must have a non-negative value.") if ($pseudo_total < 0); pod2usage("A FASTA file must be specified.") unless @ARGV; $fasta_file = shift(@ARGV); pod2usage("Only one FASTA file may be specified.") if @ARGV; # read the background file my %bg = &read_background_file($is_rna ? &rna() : &dna(), $bg_file); my $fasta_fp; if ($fasta_file eq "-") { $fasta_fp = *STDIN; } else { sysopen($fasta_fp, $fasta_file, O_RDONLY); } my $line; my $id = ''; my $acc = ''; my @rest; my $seq = ''; my $num_motifs = 0; while ($line = <$fasta_fp>) { chomp($line); if ($line =~ m/^>(.*)$/) { process_seq() if $id; #$id = $1; $_ = $1; ($id, $acc, @rest) = split; $seq = ''; } else { $seq .= $line; } } process_seq() if $id; close($fasta_fp) unless ($fasta_file eq "-"); # process the RNA sequence sub process_seq { die("Missing sequence\n") unless $seq; die("Sequence contains invalid characters\n") unless $seq =~ m/^[ACGTU]+$/; my $len = length($seq); my $s_index = position_to_index($start, $len); my $e_index = position_to_index($end, $len); if ($s_index > $e_index) { my $temp = $s_index; $s_index = $e_index; $e_index = $temp; } my $sub_seq = substr($seq, $s_index, $e_index - $s_index + 1); my @letters = reverse(split(//, uc($sub_seq))); my $matrix = ''; $len = @letters; my $i_rev = $len; foreach my $letter (@letters) { my ($match1, $wobble1, $miss1) = ($match, $wobble, $miss); unless ($seed_start == 0 || ($i_rev >= $seed_start && $i_rev <= $seed_end)) { my $scale = 4 * $other_count; $match1 = ($match + $other_count) / $scale; $wobble1 = ($wobble + $other_count) / $scale; $miss1 = ($miss + $other_count) / $scale; } if ($letter eq 'A') { $matrix .= "$miss1 $miss1 $miss1 $match1\n"; } elsif ($letter eq 'C') { $matrix .= "$miss1 $miss1 $match1 $miss1\n"; } elsif ($letter eq 'G') { $matrix .= "$miss1 $match1 $miss1 $wobble1\n"; } elsif ($letter eq 'T' || $letter eq 'U') { $matrix .= "$match1 $miss1 $wobble1 $miss1\n"; } $i_rev--; } my $url = $url_pattern; $url =~ s/MOTIF_NAME/$id/g; $url =~ s/MOTIF_AC/$acc/g; my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', $sites, $pseudo_total, id => "$id $acc", url => $url, strands => 1); print(STDERR join("\n", @{$errors}), "\n") if @{$errors}; print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif; } # converts a relative position to an actual index sub position_to_index { my ($pos, $len) = @_; if ($pos < 0) { return max(0, $len + $pos); } else { return min($len, $pos) - 1; } }