#!/usr/bin/perl -w # # FILE: sites2meme # AUTHOR: James Johnson # CREATE DATE: 25/3/2014 # DESCRIPTION: Convert files containing motif sites into MEME motifs # use warnings; use strict; use lib qw(/mit/meme_v4.11.4/lib/perl); use MotifUtils qw(seq_to_intern intern_to_meme load_alphabet read_background_file); use Fcntl qw(O_RDONLY); use File::Spec::Functions qw(catfile); use Getopt::Long; use Pod::Usage; =head1 NAME sites2meme - Converts files containing motif sites into MEME motifs =head1 SYNOPSIS sites2meme [options] Options: -ext the file extension (with '.') of the sites files; the file name minus the extension will be used as the motif identifer; default: expect an extension of ".txt" -map tab separated file containing id, name pairs. -protein Sets the expected alphabet to protein; default: the expected alphabet is DNA -alph Set the expected alphabet to the defintion in the file; default: DNA -bg file with background frequencies of letters; default: use uniform background -pseudo add times letter background to each frequency; default: 0 -logodds print log-odds matrix, too; default: print frequency matrix only -url website for the motif; The motif id is substituted for MOTIF_NAME Each file in the directory is assumed to match the pattern ID.txt where ID is the motif identifier. Each file should contain a newline separated list of sites. Writes to standard output. =cut # globals my ($bg, $pseudo_total, $url_pattern, $print_logodds, $count, $num_bad); sub main { # set global defaults $bg = undef; # loaded below $pseudo_total = 0; $url_pattern = ""; $print_logodds = 0; # FALSE $count = 0; $num_bad = 0; # Set option defaults (excluding globals) my $sites_ext = ".txt"; my $map_file = undef; my $bg_file = undef; my $is_dna = 1; # TRUE my $alph_file = undef; # process the arguments list GetOptions( "ext=s" => \$sites_ext, "map=s" => \$map_file, "bg=s" => \$bg_file, "pseudo=f" => \$pseudo_total, "url=s" => \$url_pattern, "logodds" => \$print_logodds, "alph=s" => \$alph_file, "protein" => sub {$is_dna = 0;} ) or pod2usage(2); pod2usage("A directory containing site motifs must be specified for the conversion.") unless @ARGV; my ($motif_dir) = @ARGV; # read the background file my %bg_val = &read_background_file(&load_alphabet($is_dna, $alph_file), $bg_file); $bg = \%bg_val; # read the identifier mappings my $map = &read_map($map_file); # read the specified directory my ($motif_dh, $motif_file); opendir($motif_dh, $motif_dir) || die("Cannot open directory: $!"); my @motif_files = sort { $a cmp $b } readdir($motif_dh); closedir($motif_dh); while ($motif_file = shift @motif_files) { # check if the file has the required extension if (substr($motif_file, -length($sites_ext)) eq $sites_ext) { # Read the motif information my $id = substr($motif_file, 0, length($motif_file) - length($sites_ext)); my $alt = $map->{$id}; my $sites = &read_sites(catfile($motif_dir, $motif_file)); # write out the MEME motif &write_motif($id, $alt, $sites); } } print(STDERR "Converted $count motifs.\n"); print(STDERR "$num_bad conversion errors.\n"); } sub read_sites { my ($motif_file) = @_; local $/ = undef; my $motif_fh; sysopen($motif_fh, $motif_file, O_RDONLY) || die("Can't open $motif_file.\n"); my $sites = <$motif_fh>; close($motif_fh); return $sites; } sub read_map { my ($map_file) = @_; my %map = (); if (defined($map_file)) { my ($map_fh, $line); sysopen($map_fh, $map_file, O_RDONLY) || die("Can't open $map_file.\n"); while ($line = <$map_fh>) { if ($line =~ m/^\s*(\S+)\s+(\S+)\s*$/) { $map{$1} = $2; } } close($map_fh); } return \%map; } sub write_motif { my ($id, $alt, $sites) = @_; my $url = $url_pattern; $url =~ s/MOTIF_NAME/$id/g; my ($motif, $errors) = seq_to_intern($bg, $sites, 1, $pseudo_total, id => $id, alt => $alt, url => $url, also => '-'); # display errors if (@{$errors}) { print STDERR "Motif ", $id, " Errors:\n", join("\n", @{$errors}), "\n"; } # output motif if ($motif && scalar(@{$errors}) == 0) { print STDOUT intern_to_meme($motif, $print_logodds, 1, !($count++)); } else { $num_bad++; } } main(); 1;