#!/usr/bin/perl -w # # FILE: nmica2meme # AUTHOR: Timothy L. Bailey # CREATE DATE: 16/10/2010 # DESCRIPTION: Convert a file containing list of TFBS motif matrices from # nestedMICA (BioTiffin/XMS) format to MEME output format. use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Fcntl qw(O_RDONLY); use Getopt::Long; use Pod::Usage; use XML::Simple; use Data::Dumper; =head1 SYNOPSIS nmica2meme [options] Options: -skip skip this ID (may be repeated) -numseqs assume frequencies based on this many sequences; default: 20 -truncate_names truncate motif names at first underscore -numbers use numbers instead of strings as motif names -bg file with background frequencies of letters; default: uniform background -pseudo add times letter background to each frequency; default: 0 -logodds print log-odds matrix, too; default: print frequency matrix only -url website for the motif; The untruncated ID is substituted for MOTIF_NAME; default: no url -h print usage message Read a nestedMICA (BioTiffin/XMS) matrix file and convert to MEME format. Note only DNA motifs are supported. Reads standard input. Writes standard output =cut # Set option defaults my %skips; my $num_seqs = 20; my $truncate_names = 0; my $use_numbers = 0; my $bg_file; my $pseudo_total = 0; # default total pseudocounts my $print_logodds = 0; my $url_pattern = ''; my $help; # XMS base names my @bases = ('adenine', 'cytosine', 'guanine', 'thymine'); GetOptions("skip=s" => sub {$skips{$_[1]} = 1}, "numseqs=i" => \$num_seqs, "truncate_names" => \$truncate_names, "numbers" => \$use_numbers, "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, "url=s" => \$url_pattern, "h" => \$help) or pod2usage(2); #check num seqs pod2usage("Option -numseqs must have a positive value.") if ($num_seqs < 0); #check pseudo total pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0); # output help if requested pod2usage(1) if $help; # Get the background model. my %bg = &read_background_file(&dna(), $bg_file); #DNA background file # Read the input file. my $xml = new XML::Simple(KeepRoot => 1, ForceArray => ['motif', 'column'], KeyAttr => {'motif' => 'name', 'weight' => 'symbol'}, ContentKey => '-content'); my $data; eval { $data = $xml->XMLin('-'); # read from standard input }; if ($@) { pod2usage("Bad file on standard input."); } #print STDERR Dumper($data); $data = $data->{'motifset'} if (defined($data->{motifset})); my $motifs = $data->{motif}; my $n = @bases; my $num_motifs = 0; my $num_skipped = 0; foreach my $motif_name (sort keys %{$motifs}) { my $weight_matrix = $motifs->{$motif_name}->{weightmatrix}; if ($weight_matrix->{'alphabet'} ne 'DNA') { print(STDERR "Skipped $motif_name as only DNA motifs are supported.\n"); $num_skipped++; next; } my $w = $weight_matrix->{columns}; my $cols = $weight_matrix->{column}; my $matrix = ''; # loop over columns of motif foreach my $col (@{$cols}) { # get the frequencies for this column my $freqs = $col->{weight}; # create an ascii matrix for (my $i = 0; $i < scalar(@bases); $i++) { $matrix .= " " . $freqs->{$bases[$i]}; } $matrix .= "\n"; } # convert and print ascii matrix my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'row', $num_seqs, $pseudo_total, id => $motif_name); print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif; print(STDERR join("\n", @{$errors}), "\n") if @{$errors}; } print(STDERR "Converted $num_motifs motifs.\n"); print(STDERR "Skipped $num_skipped motifs.\n");