#!/usr/bin/perl -w # # FILE: beeml2meme # AUTHOR: Timothy L. Bailey # CREATE DATE: Oct-23-2011 # DESCRIPTION: Convert a file containing list of TFBS motif matrices from BEEML # to MEME output format. BEEML format has the BEEML ID followed by the UNIPROBE ID. use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Fcntl qw(O_RDONLY); use Getopt::Long; use Pod::Usage; =head1 SYNOPSIS beeml2meme [options] + Options: -bg file with background frequencies of letters default: uniform background -pseudo add times letter background to each frequency; default: 0 -sg specifify a file containing the contents of: http://the_brain.bwh.harvard.edu/uniprobe/browse.php and use the uniprobe ID as the alternate name -logodds print log-odds matrix, too; default: print frequency matrix only -url website for the motif; The UNIPROBE ID is substituted for MOTIF_NAME; default: no url =cut # Set option defaults my $bg_file; my $uniprobe_scrape; my $pseudo_total = 0; my $print_logodds = 0; my $url_pattern = ""; my @matrix_files; GetOptions("bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, "sg=s" => \$uniprobe_scrape, "url=s" => \$url_pattern) or pod2usage(2); #check matrix file pod2usage("A matrix file must be specified for the conversion.") unless @ARGV; (@matrix_files) = sort(@ARGV); #check pseudo total pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0); # read the background file my %bg = &read_background_file(&dna(), $bg_file); # load UNIPROBE IDs from scrape of http://the_brain.bwh.harvard.edu/uniprobe/browse.php my %id_index; if (defined($uniprobe_scrape)) { %id_index = (); my $sg; sysopen($sg, $uniprobe_scrape, O_RDONLY) || die ("Can't open $uniprobe_scrape.\n"); while (<$sg>) { chomp; next if (/^#/ || /^\s*$/); # skip comment, blank lines my @words = split (/\t/); my $protein = $words[0]; $protein =~ s/\s+$//; my $uniprobe_id = $words[1]; $uniprobe_id =~ s/\s+$//; my $article = $words[5]; # translate to BEEML names if ($article eq "Berger et al., Nat Biotech 2006 ") { $article = "nbt06"; } elsif ($article eq "Berger et al., Cell 2008 ") { $article = "cell08"; } elsif ($article eq "De Silva et al., PNAS 2008 ") { $article = "pnas08"; } elsif ($article eq "Pompeani et al., Mol Microbiol 2008 ") { $article = "mmb08"; } elsif ($article eq "Badis et al., Science 2009 ") { $article = "sci09"; } elsif ($article eq "Grove et al., Cell 2009 ") { $article = "cell09"; } elsif ($article eq "Zhu et al., Genome Res 2009 ") { $article = "gr09"; } elsif ($article eq "Lesch et al., Genes Dev 2009 ") { $article = "gd09"; } elsif ($article eq "Scharer et al., Cancer Res 2009 ") { $article = "cr09"; } elsif ($article eq "Wei et al., EMBO J 2010 ") { $article = "embo10"; } else { die("Unknown article: $article\n"); } $id_index{$article . $protein} = $uniprobe_id; } close($sg); } # Open the matrix file for reading. my $count = 0; my $num_bad = 0; foreach my $matrix_file (@matrix_files) { my $matrix_fp; sysopen($matrix_fp, $matrix_file, O_RDONLY) || die("Can't open $matrix_file.\n"); # Read the input file. my $line_number = 0; my %motifs; my $line; while ($line = <$matrix_fp>) { $line_number++; #check the line is an id line my $id; my $alt; my $uniprobe; if ($line =~ m/^# ([^,\s]+)(?:\s+([^,\s]+))?/) { $id = $1; $alt = $2; if (! defined $alt) { if (%id_index) { $id =~ /([^_]+)_([^_]+)(_\w*)?/; my $article = $1; my $protein1 = $2; if ($article =~ /\.v\d+$/) { # remove end $article = substr($article, 0, $-[0]); } $protein1 =~ /^([^-]+)/; # in case there is a stupid "-" in the name my $protein2 = $1; my $uniprobe_id = $id_index{$article . $protein1}; $uniprobe_id = $id_index{$article . $protein2} unless defined $uniprobe_id; if (defined($uniprobe_id)) { $alt = $uniprobe_id; $uniprobe = $uniprobe_id; $uniprobe =~ s/^UP//; } else { printf STDERR "Can't find UniPROBE ID for article $article and protein $protein1\n"; } } } else { $uniprobe = $alt; $uniprobe =~ s/^UP//; } } else { die ("No matrix id found at line $line_number in file $matrix_file.\n"); } # Read the motif energy terms (RT units). One line for each nucleotide, # and convert to relative probabilities using exp(-x). my %rows = (); for(my $base_index = 0; $base_index < 4; $base_index++) { $line = <$matrix_fp>; $line_number++; die ("Missing motif counts at line $line_number in file $matrix_file.\n") unless (defined $line); chomp($line); if ($line =~ m/^([ACGT])(.*)$/i) { my $base = uc($1); $_ = $2; my $row = join(" ", map {2*exp(-$_)} split); # convert energy to relative probability $rows{$base} = $row; } else { die("Couldn't find base name at start of line."); } } # make a matrix I can convert into a motif my $matrix = $rows{A} . "\n" . $rows{C} . "\n" . $rows{G} . "\n" . $rows{T}; # make a url my $url; if ($uniprobe) { $url = $url_pattern; $url =~ s/MOTIF_NAME/$uniprobe/g; } # set the names my $motif_name = $id; # try to convert the motif my $nsites = 20; my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $nsites, $pseudo_total, id => $motif_name, alt => $alt, url => $url, rescale => 1); # display errors print(STDERR join("\n", @{$errors}), "\n") if @{$errors}; # output motif if ($motif) { print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++)); } else { $num_bad++; } } close($matrix_fp); } print(STDERR "Converted $count motifs.\n"); print(STDERR "$num_bad conversion errors.\n");