#!/usr/bin/python # Read GLAM2 output: write an HTML version of it import fileinput, os, tempfile, sys from string import replace embed_seqs = 0 version = '' commandline = '' alphabet = '?' alignments = [] text_alignments = [] freq_matrices = [] state = 0 usage = """ USAGE: %s \tConvert glam2 output to HTML. \tReads standard input. \tWrites standard output. """ % (sys.argv[0]) # parse command line i=1 while i < len(sys.argv): arg = sys.argv[i] print >> sys.stderr, "Unknown command line argument: " + arg; sys.exit(1) i += 1 for line in sys.stdin.readlines(): fields = line.split() if state == 0: if line.startswith('Version'): version = line elif line.find('glam2') != -1: commandline = line elif line.startswith('Residue counts'): if len(fields) == 4 + 3: alphabet = 'n' alen = 4 elif len(fields) == 20 + 3: alphabet = 'p' alen = 20 state += 1 elif state == 1: if len(fields) > 1 and fields[0] == 'Score:': score = fields[1] columns = int(fields[3]) sequences = int(fields[5]) state += 1 elif state == 2: if len(fields) == 1: keypos = fields[0] aln = [] alignments.append([score, keypos, aln]) text_aln = line text_alignments.append(text_aln) state += 1 elif state == 3: if len(fields) == 6: assert len(fields[2]) == len(keypos) aln.append(fields) text_alignments[len(text_alignments)-1] += line else: state += 1 elif state == 4: if len(fields) == alen + 3 and fields[alen] == 'Del': ipos = 0 pspm = [] freq_matrices.append([pspm, columns, sequences]) state += 1 elif state == 5: if len(fields) == alen + 2: pspm.append(fields[:alen]) ipos += 1 if ipos == columns: state = 1 assert len(alignments) > 0 # print the HTML header: print '' print '' print 'GLAM2' print '' print '' print '' print '

GLAM2: Gapped Local Alignment of Motifs

' print '

', version, '
', commandline, '

' print '

If you use this program in your research, please cite: \ MC Frith, NFW Saunders, B Kobe, TL Bailey, "Discovering sequence motifs with arbitrary insertions and deletions", PLoS Computational Biology, 4(5):e1000071, 2008.\ \ [full text]

' print '

' print '' # print the command line parameters as hidden fields fields = commandline.split() i = 1 while i < len(fields)-2: # simple flags if (fields[i] == '-2' or fields[i] == '-Q'): print '' i = i+1 elif (fields[i] == '-M'): # print the input sequences embed_seqs = 1 i = i+1 elif (fields[i] == '-A'): # print the email address i = i+1 address = replace(fields[i], '\'', ''); print '' print '' i = i+1 elif (fields[i] == '-X'): # print description field; replace '&' with ' ' i = i+1 description = replace(fields[i], '&', ' '); description = replace(description, '\'', ''); print '' i = i+1 else: # other switches with values print '' i = i+2 print '' i = i+1 print '' def regexp(aln, keypos): '''Write a regexp representation of a motif, similarly to MEME.''' alignedSequences = [row[2] for row in aln] columns = zip(*alignedSequences) regexpString = '' index = 0 while index != -1: # loop over aligned columns # get the regular expression component for this aligned column: residueCounts = {} for residue in columns[index]: if residue.isalpha(): residueCounts[residue] = residueCounts.get(residue, 0) + 1 matchCount = sum(residueCounts.values()) residues = [k for k, v in residueCounts.items() if v*5 >= matchCount and v > 0] if len(residues) == 0: regexpString += '.' elif len(residues) == 1: regexpString += residues[0] else: regexpString += '[' + ''.join(residues) + ']' if '.' in columns[index]: regexpString += '?' # bump the loop variable: start = index + 1 index = keypos.find('*', start) # find the next aligned column # get the regular expression component for the insertion, if any: if index > start: maxInsert = index - start gapSizes = [s[start:index].count('.') for s in alignedSequences] minInsert = maxInsert - max(gapSizes) regexpString += '.{' + str(minInsert) + ',' + str(maxInsert) + '}' print '', regexpString, '' def alntable(aln, keypos): print '' print '' print '' print '' print '' print '' print '' print '' print '' print '' for row in aln: print '' print '' print '' for i in range(len(keypos)): if keypos[i] == '*': print '' else: print '' print '' print '' print '' print '' print '' print '

NAME	START	SITES	END	STRAND	MARGINAL SCORE
', row[0], '	', row[1], '	', row[2][i], '	', row[2][i], '	', row[3], '	', row[4], '	', row[5], '

' def print_aln(imotif, text_aln): print ' against sequence databases using GLAM2SCAN.' % (imotif) # The "dots" in the value field are needed so Firefox doesn't remove whitespace! print '' print '
' % (imotif) def print_pspm(imotif, pspm, columns, sequences, alphabet): print '' print '' % (imotif) if alphabet == 'n': print '
to known motifs in motif databases using Tomtom.' % (imotif) def seqlogo(imotif, score): print '' print '' print '

logo with ssc
Score: ', score, '

' for imotif in range(len(alignments)): a = alignments[imotif] t = text_alignments[imotif] f = freq_matrices[imotif] if imotif == 0: print '

Best Motif Found:

' alntable(a[2], a[1]) elif imotif == 1: print '

Replicates:

' print '

Check that at least some of these are similar to the best motif found.
If not, there may well be an even better motif.
Click here to ' if embed_seqs: print '' else: print 're-run GLAM2' print 'with double the "maximum number of iterations without improvement" to find it.

' print '

' seqlogo(imotif+1, a[0]) print_aln(imotif+1, t) print_pspm(imotif+1, f[0], f[1], f[2], alphabet) print '
Regular Expression for Motif: ' regexp(a[2], a[1]) print '
' if len(alignments) < 2: print 'No replicates were performed!' print '

' # print the embedded sequences if (embed_seqs): f = open(fields[i], 'r') print '' f.close() # close the HTML: print '

' print ''