#!/usr/bin/perl -w # FILE: transfac2meme # AUTHOR: William Stafford Noble and Timothy L. Bailey # CREATE DATE: 4/22/99 # DESCRIPTION: Convert a Transfac matrix file to MEME output format. use strict; use warnings; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna rna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Fcntl qw(O_RDONLY); use Getopt::Long; use Pod::Usage; =head1 SYNOPSIS transfac2meme [options] Options: -rna output an RNA database instead of a DNA database. -numbers use numbers instead of strings as motif names -use_acc use accession names ("AC") instead of IDs -use_name use names ("NA") instead of IDs -ids keep any motifs listed in the file -species keep only motifs for this species -skip skip this ID (may be repeated) -bg file with background frequencies of letters; default: uniform background -pseudo add times letter background to each frequency; default: 0 -logodds print log-odds matrix, too; default: print frequency matrix only -url website for the motif; The ID (or accession) is substituted for MOTIF_NAME, the accession is substituted for MOTIF_AC and the motif ID is substituted for MOTIF_ID; default: no url Converts a transfac matrix.dat file into MEME motifs. The name of the motif is taken from the "ID" line by default, but the "AC" or "NA" lines can be used instead. The "NA" name, if present, is used as the secondary motif name, separated by a space from the primary name, except when it is requested to be the primary name. The IUPAC consensus letter at the end of each matrix line is optional, so this program supports some "TRANSFAC-like" formats as well. N.B. Dollar signs in TRANSFAC IDs are converted to underscores. Writes to standard output. =cut # Set option defaults my $is_rna = 0; my $use_numbers = 0; my $id_type = "ID"; my $species = ""; my %skips = (); my $id_file = ""; my $bg_file; my $pseudo_total = 0; my $print_logodds = 0; my $url_pattern = ""; my $matrix_file; GetOptions("rna" => \$is_rna, "numbers" => \$use_numbers, "use_acc" => sub {$id_type = "AC"}, "use_name" => sub {$id_type = "NA"}, "species=s" => \$species, "skip=s" => sub {$skips{$_[1]} = 1}, "ids=s" => \$id_file, "bg=s" => \$bg_file, "logodds" => \$print_logodds, "pseudo=f" => \$pseudo_total, "url=s" => \$url_pattern) or pod2usage(2); #check matrix file pod2usage("A matrix file must be specified for the conversion.") unless @ARGV; $matrix_file = shift(@ARGV); pod2usage("Only one matrix file may be specified for the conversion.") if @ARGV; #check pseudo total pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0); # read the background file my %bg = &read_background_file($is_rna ? &rna() : &dna(), $bg_file); my $line; # Store the target IDs. my %id_list = (); if ($id_file) { my $fp; sysopen($fp, $id_file, O_RDONLY) || die("Can't open \"$id_file\"."); while ($line = <$fp>) { chomp($line); my @words = split(' ', $line); foreach my $word (@words) { $id_list{$word} = 1; } } close($fp); } # Open the matrix file for reading. my $matrix_fp; sysopen($matrix_fp, $matrix_file, O_RDONLY) || die("Can't open \"$matrix_file\".\n"); # Read the input file. my $num_motifs = 0; my $num_skipped = 0; my $num_bad = 0; while ($line = <$matrix_fp>) { chomp($line); # Split the line into type and everything else. my ($type, $data) = split(/\s+/, $line, 2); next unless $type; # Have we reached a new matrix? if ($type eq $id_type) { my $matrix_name; chomp($matrix_name = $data); $matrix_name =~ tr/\$/_/; # Read to the beginning of the motif. my $this_species = ""; my $this_id = ""; my $this_accession = ""; my $this_name = ""; my $this_descr = ""; # Old versions of TRANSFAC use pee-zero; new use pee-oh. while (($type ne "PO") && ($type ne "P0")) { $line = <$matrix_fp>; if (! defined($line)) { die ("Can't find PO line for TRANSFAC matrix $matrix_name.\n"); } chomp($line); ($type, $data) = split(/\s+/, $line, 2); # Store the species line. if ($type eq "BF") { $this_species .= $data . "\n"; } # Store the accession line if ($type eq "AC") { $this_accession = $data;} # Store the ID line if ($type eq "ID") { $this_id = $data;} # Store the name line; replace whitespace with "_". if ($type eq "NA") { $this_name = $data; $this_name =~ s/\s+/_/g; } # Store the description line. if ($type eq "DE") { $this_descr = $data; } } # Read the motif. my $matrix = ""; while () { $line = <$matrix_fp>; die ("Can't find `XX' or `//' line for TRANSFAC matrix $matrix_name.\n") unless $line; chomp($line); ($type, $data) = split(/\s+/, $line, 2); # Look for the end of the motif. last if (($type eq "XX") || ($type eq "//")); # trim the (optional) IUPAC code off the end my @fields = split(/\s+/, $data); $data = join(" ", @fields[0 .. 3]); # append to the matrix $matrix .= $data . "\n"; } ###### Decide whether to print the motif. # If no criteria are given, then print it. my $print_it = 1; # If we were given a list, if ($id_file) { # was this matrix in the list? $print_it = 0 unless defined($id_list{$matrix_name}); } # If we were given a species. if ($species ne "") { # is this the right species? $print_it = 0 unless ($this_species =~ m/$species/); if ($this_species eq "") { print(STDERR "Warning: No species given for $matrix_name.\n"); } } # Were we explicitly asked to skip this one? if ($skips{$matrix_name}) { $print_it = 0; } # Print the motif. if ($print_it) { my $url = $url_pattern; $url =~ s/MOTIF_NAME/$matrix_name/g; $url =~ s/MOTIF_ID/$this_id/g if ($this_id ne ""); $url =~ s/MOTIF_AC/$this_accession/g if ($this_accession ne ""); my $name = ($use_numbers ? $num_motifs + 1 : $matrix_name); my $alt = ($use_numbers ? $matrix_name : $this_name); my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', 1, $pseudo_total, id => $name, alt => $alt, url => $url); print(STDERR join("\n", @{$errors}, $name), "\n") if @{$errors}; $num_bad++ unless $motif; print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif; } else { $num_skipped++; } } } print(STDERR "Converted $num_motifs motifs.\n"); print(STDERR "Skipped $num_skipped motifs.\n"); print(STDERR "$num_bad conversion errors.\n"); close($matrix_fp);