%\VignetteIndexEntry{Biostrings Quick Overview}
%\VignetteKeywords{DNA, RNA, Sequence, Biostrings, Sequence alignment}
%\VignettePackage{Biostrings}
%
% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is
% likely to be overwritten.
%
% Each %\Vignette... metadata entry above is kept on its own line, and every
% comment in this file ends at its own line break so that it cannot swallow
% the markup that follows it.
%
\documentclass[10pt]{article}

\usepackage{times}
\usepackage{hyperref}
\usepackage[margin=0.65in]{geometry}

% Math style shortcuts.
\newcommand{\scscst}{\scriptscriptstyle}
\newcommand{\scst}{\scriptstyle}

% Semantic markup macros: the tables below only use \Rpackage, \Rfunction,
% \Rclass and \code; styling is changed here, not at the use sites.
\newcommand{\R}{{\textsf{R}}}
\newcommand{\code}[1]{{\texttt{#1}}}
\newcommand{\term}[1]{{\emph{#1}}}
\newcommand{\Rpackage}[1]{\textsf{#1}}
\newcommand{\Rfunction}[1]{\texttt{#1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\textit{#1}}}
\newcommand{\Rfunarg}[1]{{\textit{#1}}}

\bibliographystyle{plainnat}

\begin{document}

%\setkeys{Gin}{width=0.55\textwidth}

\title{Biostrings Quick Overview}
\author{Herv\'e Pag\`es \\
        Fred Hutchinson Cancer Research Center \\
        Seattle, WA}
\date{\today}
\maketitle

%\tableofcontents

Please note that \emph{most} but \emph{not all} the functionalities provided
by the \Rpackage{Biostrings} package are listed in this document.

%-----------------------------------------------------------------------------
% Table: low-level manipulation of DNAStringSet / AAStringSet objects.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{length}
      & Return the number of sequences in an object. \\
    \hline
    \Rfunction{names}
      & Return the names of the sequences in an object. \\
    \hline
    \Rfunction{[}
      & Extract sequences from an object. \\
    \hline
    \Rfunction{head}, \Rfunction{tail}
      & Extract the first or last sequences from an object. \\
    \hline
    \Rfunction{rev}
      & Reverse the order of the sequences in an object. \\
    \hline
    \Rfunction{c}
      & Combine in a single object the sequences from 2 or more objects. \\
    \hline
    \Rfunction{width}, \Rfunction{nchar}
      & Return the sizes (i.e.\ number of letters) of all the sequences in an
        object. \\
    \hline
    \Rfunction{==}, \Rfunction{!=}
      & Element-wise comparison of the sequences in 2 objects. \\
    \hline
    \Rfunction{match}, \Rfunction{\%in\%}
      & Analogous to \Rfunction{match} and \Rfunction{\%in\%} on character
        vectors. \\
    \hline
    \Rfunction{duplicated}, \Rfunction{unique}
      & Analogous to \Rfunction{duplicated} and \Rfunction{unique} on
        character vectors. \\
    \hline
    \Rfunction{sort}, \Rfunction{order}
      & Analogous to \Rfunction{sort} and \Rfunction{order} on character
        vectors, except that the ordering of DNA or Amino Acid sequences
        doesn't depend on the locale. \\
    \hline
    \Rfunction{relist}, \Rfunction{split}, \Rfunction{extractList}
      & Analogous to \Rfunction{relist} and \Rfunction{split} on character
        vectors, except that the result is a \Rclass{DNAStringSetList} or
        \Rclass{AAStringSetList} object. \Rfunction{extractList} is a
        generalization of \Rfunction{relist} and \Rfunction{split} that
        supports \emph{arbitrary} groupings. \\
    \hline
  \end{tabular}
  \caption{\textbf{Low-level manipulation of \Rclass{DNAStringSet} and
           \Rclass{AAStringSet} objects.}}
  \label{table:Low_level_manipulation}
\end{table}

%-----------------------------------------------------------------------------
% Table: counting / tabulating.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{alphabetFrequency}\par
    \Rfunction{letterFrequency}
      & Tabulate the letters (all the letters in the alphabet for
        \Rfunction{alphabetFrequency}, only the specified letters for
        \Rfunction{letterFrequency}) of a sequence or set of sequences. \\
    \hline
    \Rfunction{letterFrequencyInSlidingView}
      & Specialized version of \Rfunction{letterFrequency} that tallies the
        requested letter frequencies for a fixed-width view that is
        conceptually slid along the input sequence. \\
    \hline
    \Rfunction{consensusMatrix}
      & Compute the consensus matrix of a set of sequences. \\
    \hline
    \Rfunction{dinucleotideFrequency}\par
    \Rfunction{trinucleotideFrequency}\par
    \Rfunction{oligonucleotideFrequency}
      & Fast 2-mer, 3-mer, and k-mer counting for DNA or RNA. \\
    \hline
    \Rfunction{nucleotideFrequencyAt}
      & Tally the short sequences formed by extracting the nucleotides found
        at a set of fixed positions from each sequence of a set of DNA or RNA
        sequences. \\
    \hline
  \end{tabular}
  \caption{\textbf{Counting / tabulating.}}
  \label{table:Counting_tabulating}
\end{table}

%-----------------------------------------------------------------------------
% Table: sequence transformation and editing.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{reverse}\par
    \Rfunction{complement}\par
    \Rfunction{reverseComplement}
      & Compute the reverse, complement, or reverse-complement, of a set of
        DNA sequences. \\
    \hline
    \Rfunction{translate}
      & Translate a set of DNA sequences into a set of Amino Acid
        sequences. \\
    \hline
    \Rfunction{chartr}
      & Translate the letters in a set of sequences. \\
    \hline
    \Rfunction{subseq}, \Rfunction{subseq<-}\par
    \Rfunction{extractAt}, \Rfunction{replaceAt}
      & Extract/replace arbitrary substrings from/in a string or set of
        strings. \\
    \hline
    \Rfunction{replaceLetterAt}
      & Replace the letters specified by a set of positions by new
        letters. \\
    \hline
    \Rfunction{padAndClip}, \Rfunction{stackStrings}
      & Pad and clip strings. \\
    \hline
    \Rfunction{strsplit}, \Rfunction{unstrsplit}
      & \Rfunction{strsplit} splits the sequences in a set of sequences
        according to a pattern. \Rfunction{unstrsplit} is the reverse
        operation, i.e.\ a fast implementation of
        \code{sapply(x, paste0, collapse=sep)} for collapsing the list
        elements of a \Rclass{DNAStringSetList} or \Rclass{AAStringSetList}
        object. \\
    \hline
  \end{tabular}
  \caption{\textbf{Sequence transformation and editing.}}
  \label{table:Sequence_editing}
\end{table}

%-----------------------------------------------------------------------------
% Table: string matching / alignments.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{matchPattern}\par
    \Rfunction{countPattern}
      & Find/count all the occurrences of a given pattern (typically short)
        in a reference sequence (typically long). Support mismatches and
        indels. \\
    \hline
    \Rfunction{vmatchPattern}\par
    \Rfunction{vcountPattern}
      & Find/count all the occurrences of a given pattern (typically short)
        in a set of reference sequences. Support mismatches and indels. \\
    \hline
    \Rfunction{matchPDict}\par
    \Rfunction{countPDict}\par
    \Rfunction{whichPDict}
      & Find/count all the occurrences of a set of patterns in a reference
        sequence. (\Rfunction{whichPDict} only identifies which patterns in
        the set have at least one match.) Support a small number of
        mismatches. \\
    \hline
    % The parenthetical below describes the vectorized variant listed in this
    % row, hence vwhichPDict (not whichPDict, which belongs to the row above).
    \Rfunction{vmatchPDict}\par
    \Rfunction{vcountPDict}\par
    \Rfunction{vwhichPDict}
      & [Note: \Rfunction{vmatchPDict} not implemented yet.] Find/count all
        the occurrences of a set of patterns in a set of reference sequences.
        (\Rfunction{vwhichPDict} only identifies for each reference sequence
        which patterns in the set have at least one match.) Support a small
        number of mismatches. \\
    \hline
    \Rfunction{pairwiseAlignment}
      & Solve (Needleman--Wunsch) global alignment, (Smith--Waterman) local
        alignment, and (ends-free) overlap alignment problems. \\
    \hline
    \Rfunction{matchPWM}\par
    \Rfunction{countPWM}
      & Find/count all the occurrences of a Position Weight Matrix in a
        reference sequence. \\
    \hline
    \Rfunction{trimLRPatterns}
      & Trim left and/or right flanking patterns from sequences. \\
    \hline
    \Rfunction{matchLRPatterns}
      & Find all paired matches in a reference sequence, i.e.\ matches
        specified by a left and a right pattern, and a maximum distance
        between them. \\
    \hline
    \Rfunction{matchProbePair}
      & Find all the amplicons that match a pair of probes in a reference
        sequence. \\
    \hline
    \Rfunction{findPalindromes}
      & Find palindromic regions in a sequence. \\
    \hline
  \end{tabular}
  \caption{\textbf{String matching / alignments.}}
  \label{table:String_matching_alignments}
\end{table}

%-----------------------------------------------------------------------------
% Table: I/O functions.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{readBStringSet}\par
    \Rfunction{readDNAStringSet}\par
    \Rfunction{readRNAStringSet}\par
    \Rfunction{readAAStringSet}
      & Read ordinary/DNA/RNA/Amino Acid sequences from files (FASTA or FASTQ
        format). \\
    \hline
    \Rfunction{writeXStringSet}
      & Write sequences to a file (FASTA or FASTQ format). \\
    \hline
    \Rfunction{writePairwiseAlignments}
      & Write pairwise alignments (as produced by
        \Rfunction{pairwiseAlignment}) to a file (``pair'' format). \\
    \hline
    \Rfunction{readDNAMultipleAlignment}\par
    \Rfunction{readRNAMultipleAlignment}\par
    \Rfunction{readAAMultipleAlignment}
      & Read multiple alignments from a file (FASTA, ``stockholm'', or
        ``clustal'' format). \\
    \hline
    \Rfunction{write.phylip}
      & Write multiple alignments to a file (Phylip format). \\
    \hline
  \end{tabular}
  \caption{\textbf{I/O functions.}}
  \label{table:I_O_functions}
\end{table}

%-----------------------------------------------------------------------------
% Table: miscellaneous.
\begin{table}[ht]
  \centering
  \begin{tabular}{p{2.5in}|p{4in}}
    \textbf{Function} & \textbf{Description} \\
    \hline
    \Rfunction{stringDist}
      & Compute the matrix of Levenshtein edit distances, or Hamming
        distances, or pairwise alignment scores, for a set of strings. \\
    \hline
  \end{tabular}
  \caption{\textbf{Miscellaneous.}}
  \label{table:Miscellaneous}
\end{table}

\end{document}