#!/usr/bin/perl use strict; use warnings; use Fcntl qw(O_CREAT O_WRONLY O_TRUNC); use Getopt::Long; use Pod::Usage; =head1 NAME fasta-re-match - reports all matches to a IUPAC DNA motif, 1 per line. =head1 SYNOPSIS fasta-re-match [options] Options: -norc Only find matches to motifs in the given strand -erase erases this motif before finding matches; repeatable; order matters! -help prints this help message Reads sequences from standard input. Writes to standard output tab separated (space padded) columns: If you are trying to recreate DREME motif sites note that DREME erases previously found motifs so you will have to use the -erase option for any but the first motif, like: fasta-re-match -erase CCMCRCCC TTATCW < sample-dna-Klf1.fa If you want a count of sites try piping the output to "wc -l" like: fasta-re-match CCMCRCCC < sample-dna-Klf1.fa | wc -l If you want only one of the columns try piping the output to "cut -f " like: fasta-re-match CCMCRCCC < sample-dna-Klf1.fa | cut -f 1 =cut my $help = 0; #FALSE my $norc = 0; #FALSE my @erase = (); my $iupac = undef; my @erase_re = (); my $motif_re = undef; my $motif_given_re = undef; GetOptions( "norc" => \$norc, "erase=s" => \@erase, "help|?" => \$help ) or pod2usage(2); pod2usage(1) if ($help); ($iupac) = @ARGV; # check and create erasure motif REs for (my $i = 0; $i < scalar(@erase); $i++) { $erase[$i] = uc($erase[$i]); pod2usage('Value "'. $erase[$i]. '" is invalid for option erase '. '(non IUPAC DNA characters)') if ($erase[$i] =~ /[^ACGTRYSWKMBDHVN]/); push(@erase_re, &iupac_re($erase[$i], $norc)); } # check and create search motif RE pod2usage("No motif given") unless ($iupac); $iupac = uc($iupac); pod2usage("Non IUPAC DNA characters") if ($iupac =~ /[^ACGTRYSWKMBDHVN]/); $motif_re = &iupac_re($iupac, $norc); $motif_given_re = &iupac_re($iupac, 1); my $line; my $seq; my $id; my @pos = (); my $in_seq = 0; #FALSE my $lineno = 1; while ($line = ) { if ($line =~ m/^>(\S+)/) { &print_matches(); $id = $1; $in_seq = 1; #TRUE @pos = (); $seq = ''; } else { if ($in_seq) { while ($line =~ /(\S+)/g) { push(@pos, [length($seq), $lineno, $-[0]]); $seq .= $1; } } } $lineno++; } &print_matches(); pod2usage("Empty sequences file") unless $in_seq; exit(0); sub print_matches { return unless $in_seq; # erase any requested motifs for (my $i = 0; $i < scalar(@erase_re); $i++) { my $re = $erase_re[$i]; my $sub = 'N' x length($erase[$i]); $seq =~ s/$re/$sub/g; } while ($seq =~ /$motif_re/g) { my $match = $1; my $offset = $-[0]; my ($line, $col) = &lookup_pos($offset); my $strand = $match =~ m/^$motif_given_re$/ ? '+' : '-'; printf("%s\t%s\t%4d\t%3d\t%5d\t%s\n", $match, $strand, $line, $col, $offset, $id); } } sub lookup_pos { my ($target) = @_; my $min = 0; my $max = scalar(@pos)-1; while ($min < $max) { use integer; my $mid = ($min + $max) / 2; if ($pos[$mid]->[0] > $target) { $max = $mid - 1; } elsif ($pos[$mid+1]->[0] <= $target) { $min = $mid + 1; } else { $min = $mid; $max = $mid; } } my @info = @{$pos[$min]}; return ($info[1], $info[2] + ($target - $info[0]) + 1); } sub iupac_to_bracket { my ($in) = @_; $in =~ s/A/[A]/g; $in =~ s/C/[C]/g; $in =~ s/G/[G]/g; $in =~ s/T/[T]/g; $in =~ s/N/[ACGT]/g; $in =~ s/V/[ACG]/g; $in =~ s/H/[ACT]/g; $in =~ s/D/[AGT]/g; $in =~ s/B/[CGT]/g; $in =~ s/M/[AC]/g; $in =~ s/K/[GT]/g; $in =~ s/W/[AT]/g; $in =~ s/S/[GC]/g; $in =~ s/Y/[CT]/g; $in =~ s/R/[AG]/g; $in =~ s/A/Aa/g; $in =~ s/C/Cc/g; $in =~ s/G/Gg/g; $in =~ s/T/TtUu/g; return $in; } sub iupac_re { my ($orig, $norc) = @_; my $re = &iupac_to_bracket($orig); unless ($norc) { my $revc = reverse($orig); $revc =~ tr/ACGTRYSWKMBDHVN/TGCAYRSWMKVHDBN/; $re = $re . '|' . &iupac_to_bracket($revc); } return qr/($re)/; } 1;