#!/usr/bin/perl -w # # FILE: taipale2meme # AUTHOR: James Johnson # CREATE DATE: 19/10/2010 # DESCRIPTION: Process tab separated files exported from spreadsheets (xls files). # use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use Alphabet qw(dna); use MotifUtils qw(matrix_to_intern intern_to_meme read_background_file); use Getopt::Long; use Pod::Usage; =head1 NAME taipale2meme - Process tab separated value files that have been exported from spreadsheets to meme motifs. =head1 SYNOPSIS taipale2meme [options] Options: [-nc ]* file columns from which to create motif names by joining with "_"; default: first non-empty column [-oc ]* omit PWM if this column not empty -postfix text to append to motif names. -strands 1|2 print '+ -' or '+' on the MEME strand line; default: 2 (prints '+ -') -bg file with background frequencies of letters; default: uniform -pseudo add times background frequency to each count when computing letter frequencies default: 0 -logodds print log-odds matrix as well as frequency matrix; default: frequency matrix only -url website for the motif if it doesn't have one already; The motif name is substituted for MOTIF_NAME; default: use existing url -h print usage message Reads standard input. Writes standard output. =cut # Constants my $sites = 20; # set option defaults my @name_cols; my @omit_cols; my $postfix = ""; my $strands = 2; my $bg_file; my $pseudo_total = 0; my $print_logodds = 0; my $url_pattern = ""; my $help = 0; GetOptions( "nc=i" => \@name_cols, "oc=i" => \@omit_cols, "postfix=s" => \$postfix, "strands=i" => \$strands, "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "logodds" => \$print_logodds, "url=s" => \$url_pattern, "help|?" => \$help) or pod2usage(2); #check strands pod2usage("Option -strands must be either 1 or 2.") unless ($strands == 1 || $strands == 2); #check pseudo total pod2usage("Option -pseudo must have a positive value.") if ($pseudo_total < 0); pod2usage(1) if $help; #printf STDERR "%s\n", join(" ", @name_cols); my @l5l = ("", "", "", "", ""); my $matchA = qr/^(["']?)[aA]\1\t/; my $matchC = qr/^(["']?)[cC]\1\t/; my $matchG = qr/^(["']?)[gG]\1\t/; my $matchT = qr/^(["']?)[tT]\1\t/; # get the background model my %bg = &read_background_file(&dna(), $bg_file); my %dup_check = (); my %base_name = (); my %matrices = (); while (<>) { chomp; # skip blank lines next if (/^\s*$/); #update the last 5 lines push(@l5l, $_); shift(@l5l); #look for A, C G and T possibly wrapped with " or ' at the start of the last 4 lines if ($l5l[1] =~ $matchA && $l5l[2] =~ $matchC && $l5l[3] =~ $matchG && $l5l[4] =~ $matchT) { # try to extract the name my $name = ""; if (scalar(@name_cols)==0) { # name is first non-empty column foreach (split(/\t/,$l5l[0])) { if ($_ =~ m/^(["']?)(.+)\1$/) { $name = $2; last; } } } else { my $col = 0; $name = ""; my @fields = split(/\t/,$l5l[0]); foreach my $col (@name_cols) { $_ = $fields[$col-1]; if ($_ =~ m/^\s*(["']?)(.+)\1\s*$/) { $name .= $name ? "_$2" : $2; $name =~ s/ /_/g; # replace spaces with underscores } } } die("Missing motif name!\n") if ($name eq ""); $name .= $postfix; # Add _n to names (starting with _1) and save the base name for convenience my $base = $name; $dup_check{$base} = $dup_check{$base} ? $dup_check{$base}+1 : 1; $name .= "_" . $dup_check{$base}; $base_name{$name} = $base; # check that "omit" fields are empty my $keep = 1; foreach my $col (@omit_cols) { my @fields = split(/\t/,$l5l[0]); if ($fields[$col-1] ne "") { $keep = 0; last; } } # save the PWM matrix indexed by name if ($keep) { # extract the PSPM and convert it my $matrix = ""; for (my $i = 1; $i < 5; $i++) { my $line = $l5l[$i]; $line =~ s/^(["']?)[aAcCgGtT]\1\t//; $matrix .= $line . "\n"; } $matrices{$name} = $matrix; } #ensure no accidental reuse of data by clearing the cache @l5l = ("", "", "", "", ""); } } # Output the motifs only adding _n if more than 1 motif with same base_name my $num_motifs = 0; my $name; foreach $name (sort(keys %matrices)) { my $matrix = $matrices{$name}; # Strip off the _1 from the name if there is only one motif with the base_name. my $base = $base_name{$name}; $name = $base_name{$name} if ($dup_check{$base} == 1); my $url = $url_pattern; $url =~ s/MOTIF_NAME/$name/g; my ($motif, $errors) = matrix_to_intern(\%bg, $matrix, 'col', $sites, $pseudo_total, rescale => 1, id => $name, url => $url); print STDERR join("\n", @{$errors}), "\n" if @{$errors}; print intern_to_meme($motif, $print_logodds, 1, !($num_motifs++)) if $motif; }