#!/usr/bin/perl -w # # $Id $ # $Log $ # # FILE: scpd2meme # AUTHOR: William Stafford Noble, Timothy L. Bailey, Charles Grant # CREATE DATE: 1/06/2006 # DESCRIPTION: Convert a file containing list of TFBS motif matrices from SCPD # to MEME output format. use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Fcntl qw(O_RDONLY); use Getopt::Long; use Pod::Usage; =head1 SYNOPSIS scpd2meme [options] Options: -skip skip this ID (may be repeated) -numbers use numbers instead of strings as motif names -bg file with background frequencies of letters; default: uniform background -pseudo add times letter background to each frequency; default: 0 -logodds print log-odds matrix, too; default: print frequency matrix only -url website for the motif; The scpd ID is substituted for MOTIF_NAME; default: no url =cut # Set option defaults my %skips = (); my $use_numbers = 0; my $bg_file; my $pseudo_total = 0; my $print_logodds = 0; my $url_pattern = ""; my $matrix_file; GetOptions("skip=s" => sub {$skips{$_[1]} = 1}, "numbers" => \$use_numbers, "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, "url=s" => \$url_pattern) or pod2usage(2); #check matrix file pod2usage("A matrix file must be specified for the conversion.") unless @ARGV; $matrix_file = shift(@ARGV); pod2usage("Only one matrix file may be specified for the conversion.") if @ARGV; #check pseudo total pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0); # read the background file my %bg = &read_background_file(&dna(), $bg_file); # Open the matrix file for reading. my $matrix_fp; sysopen($matrix_fp, $matrix_file, O_RDONLY) || die("Can't open $matrix_file.\n"); # Read the input file. my $count = 0; my $num_skipped = 0; my $num_bad = 0; my $line_number = 0; my %motifs; my $line; while ($line = <$matrix_fp>) { $line_number++; #check the line is an id line my $id; if ($line =~ m/^>(\S+)\b/) { $id = $1; } else { die ("No matrix id found at line $line_number in file $matrix_file.\n"); } # Read the motif base counts. One line for each nucleotide. my %rows = (); for(my $base_index = 0; $base_index < 4; $base_index++) { $line = <$matrix_fp>; $line_number++; die ("Missing motif counts at line $line_number in file $matrix_file.\n") unless (defined $line); chomp($line); if ($line =~ m/^([ACGT])(.*)$/i) { my $base = uc($1); my $row = $2; $rows{$base} = $row; } else { die("Couldn't find base name at start of line."); } } # skip unwanted motifs if ($skips{$id}) { $num_skipped++; next; } # make a matrix I can convert into a motif my $matrix = $rows{A} . "\n" . $rows{C} . "\n" . $rows{G} . "\n" . $rows{T}; # make a url my $url = $url_pattern; $url =~ s/MOTIF_NAME/$id/g; # set the names my $motif_name = ($use_numbers ? $count + 1 : $id); my $alt = ($use_numbers ? $id : ''); # try to convert the motif my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', undef, $pseudo_total, id => $motif_name, alt => $alt, url => $url); # display errors print(STDERR join("\n", @{$errors}), "\n") if @{$errors}; # output motif if ($motif) { print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++)); } else { $num_bad++; } } print(STDERR "Converted $count motifs.\n"); print(STDERR "Skipped $num_skipped motifs.\n"); print(STDERR "$num_bad conversion errors.\n"); close($matrix_fp);