#!/usr/bin/perl =head1 NAME fasta-subsample - Takes a (normally random) subset of sequences in a FASTA file. =head1 SYNOPSIS fasta-subsample [-seed ] [-norand] [-rest ] [-off ] [-len ] name of FASTA sequence file number of sequences to output [-seed ] random number seed; default: 1 [-norand] disable random selection and selects sequences in file order; overrides the -seed option [-rest ] name of file to receive the FASTA sequences not being output; default: none [-off ] print starting at position in each sequence; default: 1 [-len ] print up to characters for each sequence; default: print entire sequence Output a subsample of size of the sequences in a FASTA sequence file. By default the sequences will be selected using a random number generator that is always seeded to 1 resulting in the same subset of sequences always being output. You may modify the seed of the random number generator and hence the selected subset by using the -seed option or alternatively you may disable it altogether with the -norand option and just select the first sequences in the file. If requested, the remaining sequences will be output to a file named , which is useful for cross-validation. You can also choose to only output portions of each sequence using the -off and -len switches. If the sequences have UCSC BED file headers (e.g., ">chr1:0-99999"), the headers will be adjusted to reflect -off and -len. Writes to standard output. =cut use strict; use warnings; use Getopt::Long; use Pod::Usage; # constants my $line_size = 50; # size of lines to print ################################################################################ # print (a portion of) a FASTA sequence # Assumes FASTA file is open and the index contains # the file byte offset for a given ID. ################################################################################ sub print_fasta_seq_portion { my ($fasta, $output, $id, $off, $len, $index) = @_; my $addr = $index->{$id}; # address of sequence die "Can't find target $id.\n" unless (defined($addr)); seek($fasta, $addr, 0); # move to start of target # save ID line my $id_line = <$fasta>; my $seq = ""; # read in sequence lines for this sequence while (<$fasta>) { # read sequence lines if (/^>/) {last} # start of next sequence chop; $seq .= $_; } # get length of sequence my $length = length($seq); # print ID of FASTA sequence $_ = $id_line; if (/^(>chr[\dXY]+):(\d+)-(\d+)/i) { # handle BED format my $chr = $1; my $start = $2; my $end = $3; my $comment = $'; $start = $start + ($off - 1); # new start; 0-based # BED end is really "end+1" $end = ($len != -1) ? $start + $len : $start + $length; # print ID for the sequence in BED format, adjusting for offset and length printf($output "%s:%s-%s%s", $chr, $start, $end, $comment); } else { # handle other formats printf($output $_); # print ID for this sequence } # print sequence in lines of length $line_size # get portion of sequence to print if -off and/or -len given if ($off != 1 || $len != -1) { if ($len == -1) { $seq = substr($seq, $off-1); } else { $seq = substr($seq, $off-1, $len); } } for (my $i = 0; $i < length($seq); $i += $line_size) { print $output substr($seq, $i, $line_size), "\n"; } } ################################################################################ # shuffle a list in place ################################################################################ sub shuffle { my $r=pop; $a = $_ + rand @{$r} - $_ and @$r[$_, $a] = @$r[$a, $_] for (0..$#{$r}); } ################################################################################ # Main program ################################################################################ # set defaults my $seed = 1; # random number seed my $norand = 0; # when enabled do not shuffle! Instead pick the first sequences as the subsample my $off = 1; # first position of sequence to print my $len = -1; # maximum length of sequence to print my $rest = ""; # file to receive remainder of seqs my $help = 0; # when enabled show help # read settings GetOptions( 'seed=i' => \$seed, 'norand' => \$norand, 'rest=s' => \$rest, 'off=i' => \$off, 'len=i' => \$len, 'help|?' => \$help ) or pod2usage(2); pod2usage(1) if $help; pod2usage("Missing required arguments.") if (scalar(@ARGV) < 2); my ($fasta, $n) = @ARGV; pod2usage("Required FASTA file \"$fasta\" does not exist.") unless (-e $fasta); pod2usage("Required sequence count \"$n\" is not a whole number.") unless ($n =~ m/^\d+$/); # read in FASTA file and make index open(FASTA, "<$fasta") || die "Couldn't open file `$fasta'.\n"; my $byte = 0; my %index; # ID-to-start index my $id; # sequence ID my @rest; # dummy my @id_list; # list of all IDs while () { if (/^>/) { ($id, @rest) = split; $index{$id} = $byte; # start of sequence record push @id_list, $id; } $byte += length; } # read FASTA file # check that there are enough IDs my $nseqs = @id_list; die ("Not enough sequences ($nseqs); $n requested.\n") if ($nseqs < $n); unless ($norand) { # shuffle the list of IDs srand($seed); shuffle(\@id_list); #print join " ", @id_list, "\n"; } # output the requested number of FASTA sequences to STDOUT foreach $id (@id_list[0..$n-1]) { print_fasta_seq_portion(*FASTA, *STDOUT, $id, $off, $len, \%index); } # id # output the remainder of the sequences if requested # to the "rest" file if ($rest) { open(REST, ">$rest") || die("Can't open file `$rest'.\n"); foreach $id (@id_list[$n..$nseqs-1]) { print_fasta_seq_portion(*FASTA, *REST, $id, $off, $len, \%index); } } 1;