"""Turn creole_lexer's (token-type, text) stream into structural events.

`tokenize` walks the lexer output and yields Start/End/Entity/Text events
(constructors star-imported from `tokens`), handling inline styles,
headings, lists, tables, links, macros (creole ``<<...>>`` and TeX-style
``\\foo{...}``) and "smart" punctuation (em/en dashes, curly quotes).
"""

import sys

import creole_lexer as cl
from tokens import *

debug = False

# Inline styles where the same token both opens and closes the span.
TOGGLE_MAP = {cl.Token.Inline.Bold: BOLD,
              cl.Token.Inline.Italic: ITALIC,
              cl.Token.Inline.Monospace: MONOSPACE,
              cl.Token.Inline.Superscript: SUPERSCRIPT,
              cl.Token.Inline.Subscript: SUBSCRIPT,
              cl.Token.Inline.Underline: UNDERLINE,
              #cl.Token.Inline.Strike: STRIKE,
              }
# Styles with distinct open/close tokens.
OPEN_MAP = {cl.Token.Open.Codeblock: CODEBLOCK,
            cl.Token.Open.Code: MONOSPACE}
CLOSE_MAP = {cl.Token.Close.Codeblock: CODEBLOCK,
             cl.Token.Close.Code: MONOSPACE}
# Tokens that map straight to a self-contained entity event.
ENTITY_MAP = {cl.Token.Hrule: HRULE,
              cl.Token.LineBreak: LINEBREAK}
LINK_START_MAP = {cl.Token.Start.Link: LINK,
                  cl.Token.Start.Image: IMAGE}
LINK_END_MAP = {cl.Token.End.Link: LINK,
                cl.Token.End.Image: IMAGE}
LIST_MAP = {cl.Token.List.Unordered: UNORDERED_ITEM,
            cl.Token.List.Ordered: ORDERED_ITEM,
            cl.Token.Blockquote: BLOCKQUOTE_LINE}
TABLE_MAP = {cl.Token.Table.Heading: TABLE_HEADING,
             cl.Token.Table.Cell: TABLE_CELL}
# "Smart" punctuation substitutions (dashes, typographic quotes, arrows).
PUNCT_MAP = {cl.Token.EmDash: u'\u2014',
             cl.Token.EnDash: u'\u2013',
             cl.Token.OpenQuote: u'\u201C',
             cl.Token.CloseQuote: u'\u201D',
             cl.Token.OpenSingleQuote: u'\u2018',
             cl.Token.Ellipsis: u'\u2026',
             cl.Token.DoubleBoth: u'\u21D4',
             cl.Token.SingleBoth: u'\u2194',
             cl.Token.DoubleRight: u'\u21D2',
             cl.Token.SingleRight: u'\u2192',
             cl.Token.DoubleLeft: u'\u21D0',
             cl.Token.SingleLeft: u'\u2190',
             }
TILDE_OP = u'\u223C'

TEXT = set((cl.Text, cl.Punctuation, cl.Token.ImplicitLinkText))
NL = (cl.Token.NL,)
SP = (cl.Token.SP,)
# Tokens before which all pending non-macro environments are closed.
FLUSH_STACK_BEFORE = set(LIST_MAP.keys() +
                         [cl.Token.ParaBreak, cl.Token.Heading,
                          cl.Token.Macro.Start.Close, cl.Token.EOF])
# Tokens before which trailing whitespace in the text buffer is dropped.
SKIP_WHITESPACE_BEFORE = set(LIST_MAP.keys() + TABLE_MAP.keys() +
                             [cl.Token.Heading, cl.Token.EOF,
                              cl.Token.Table.End, cl.Token.List.End])
IGNORE = (cl.Token.Table.End, cl.Token.List.End)
QUOTES = frozenset("'\"")
GROUPING = {'(': ')', '[': ']'}


def token_stream(markup):
    """Lex *markup*, yielding (token-type, text) pairs plus a final EOF.

    The input is padded so it always ends with a blank line, which lets
    downstream code rely on a trailing paragraph break.
    """
    if markup.endswith('\n\n'):
        pass
    elif markup.endswith('\n'):
        markup += '\n'
    else:
        markup += '\n\n'
    if debug:
        print >>sys.stderr, repr(markup)
    lexer = cl.CreoleLexer()
    for t in lexer.get_tokens(markup):
        if t[0] != cl.Token.Nothing:
            yield t
    # Explicit sentinel so tokenize() has a single end-of-input path.
    yield (cl.Token.EOF, '')


TOKEN_STREAM_CACHE = {}

# This cache has only marginal benefit... with performance improvements
# deeper, it wouldn't make sense.
def caching_token_stream(markup):
    """Return the (memoized) list of lexer tokens for *markup*."""
    if markup in TOKEN_STREAM_CACHE:
        return TOKEN_STREAM_CACHE[markup]
    else:
        l = list(token_stream(markup))
        TOKEN_STREAM_CACHE[markup] = l
        return l


def tokenize(markup, error_func=lambda:None):
    """Generate structural events (Start/End/Entity/Text) for *markup*.

    *error_func* is called once per recoverable markup error; the error
    itself is reported in-stream as an Entity(ERROR, ...) event.
    """
    def gen_error(msg):
        # Report an error both out-of-band (callback) and in-stream.
        error_func()
        return Entity(ERROR, msg)

    text_bits = []          # pending plain-text fragments, joined lazily
    stack = []              # open environments as [style, arg] pairs
    linkdest = None         # link/image target being accumulated, or None
    macroname = None        # macro name being accumulated, or None
    arglist = None          # macro argument list being accumulated, or None
    arglistquote = None     # active quote char inside arglist (True = TeX)
    arglistnest = []        # expected closers for nested (..)/[..] in arglist
    eatspaces = False       # drop whitespace tokens while set
    eatspacesnext = False   # arm `eatspaces` after the current text node
    implicit_link = False   # scheme ('http'/'https') of a bare URL, or False
    open_cell = None        # currently open table cell style (True = delayed)
    # Delayed until we see something other than eaten spaces, dropped if
    # that's a newline or EOF
    delayed_token = None
    open_quote_next = True  # next quote should be an opening (curly) quote
    last_quote = None       # token type of the most recent quote seen
    macroend = False        # was the last macro start a closing form?
                            # (initialized here so a stray macro-end token
                            # can't hit a NameError)

    for typ, toktext in caching_token_stream(markup):
        if debug:
            print >>sys.stderr, 'TOKEN:', (typ, toktext)

        # Headings are implicitly closed by end-of-line (or EOF/parabreak/
        # macro close); strip trailing '='s and whitespace first.
        if (stack and stack[-1][0] == HEADING and
                typ in NL + (cl.Token.ParaBreak, cl.Token.EOF,
                             cl.Token.Macro.Start.Close)):
            while text_bits and not text_bits[-1].strip():
                text_bits.pop()
            while text_bits and text_bits[-1] == '=':
                text_bits.pop()
            while text_bits and not text_bits[-1].strip():
                text_bits.pop()
            if text_bits:
                yield Text(''.join(text_bits))
                text_bits = []
            yield End(HEADING, stack[-1][1])
            stack.pop()

        # A bare http(s)://... link ends at the first non-link-text token.
        if (implicit_link and typ != cl.Token.ImplicitLinkText and
                typ != cl.Token.Escape):
            path = ''.join(text_bits)
            #print >>sys.stderr, ["End IL: ", typ, path[-1]]
            text_bits = []
            # Trailing punctuation (e.g. a sentence-ending '.') is not part
            # of the URL; push it back as plain text.  Guard against an
            # empty path so path[-1] can't raise IndexError.
            if path and not path[-1].isalnum() and path[-1] != '/':
                text_bits.append(path[-1])
                path = path[:-1]
            yield Entity(LINK, implicit_link + '://' + path)
            implicit_link = False

        # A delayed table-cell start is emitted once we see real content;
        # it is silently dropped if the line ends first.
        if delayed_token is not None and typ not in SP:
            if typ not in NL and typ != cl.Token.EOF:
                yield Start(delayed_token)
                open_cell = delayed_token
            delayed_token = None

        if typ in TEXT:
            if debug:
                print >>sys.stderr, "TEXT", toktext, (linkdest, arglist,
                                                     macroname)
            text = None
            if arglist is None and macroname is None:
                # We only muck with quote state in here because macro
                # names/args don't affect smart quote behavior.
                last_quote = None
                # These are the punct that we expect an open quote could
                # immediately follow.
                open_quote_next = (toktext in ('-', '/', '(', '[', '{'))
            if linkdest is None and arglist is None and macroname is None:
                # Weird text cases.
                if debug:
                    print >>sys.stderr, 'weird text', toktext, text_bits
                if (toktext == '-' and len(text_bits) > 0 and
                        text_bits[-1].isdigit()):
                    # digit-hyphen: tentatively an en dash (number range).
                    text = u'\u2013'
                elif (len(text_bits) > 0 and text_bits[-1] == u'\u2013' and
                        not toktext.isdigit()):
                    # The tentative en dash wasn't followed by a digit:
                    # demote to a plain hyphen (after a digit) or promote
                    # to an em dash.
                    if len(text_bits) > 1 and text_bits[-2].isdigit():
                        text_bits[-1] = '-' #u'\u2014'
                    else:
                        text_bits[-1] = u'\u2014'
            if text is None:
                text = toktext
        elif typ == cl.Token.SingleQuote:
            if debug:
                print >>sys.stderr, 'oqn', open_quote_next
            if open_quote_next and (last_quote is None or last_quote == typ):
                text = u'\u2018'
            else:
                text = u'\u2019'
            last_quote = typ
        elif typ == cl.Token.DoubleQuote:
            if open_quote_next and (last_quote is None or last_quote == typ):
                text = u'\u201C'
            else:
                text = u'\u201D'
            last_quote = typ
        elif typ in PUNCT_MAP:
            open_quote_next = True
            if linkdest is None and arglist is None and macroname is None:
                # Weird punct cases.
                if (typ == cl.Token.EnDash and len(text_bits) > 0 and
                        text_bits[-1] == ' '):
                    # space + "--": treat as an em dash, swallowing the
                    # surrounding spaces.
                    if debug:
                        print >>sys.stderr, 'Emdashing'
                    text_bits.pop()
                    typ = cl.Token.EmDash
                    eatspacesnext = True
                elif (typ == cl.Token.EnDash and len(text_bits) > 0 and
                        not text_bits[-1].isdigit()):
                    # "--" not in a number range: also an em dash.
                    if debug:
                        print >>sys.stderr, 'Emdashing2'
                    typ = cl.Token.EmDash
                    eatspacesnext = True
            text = PUNCT_MAP[typ]
            if debug:
                print >>sys.stderr, typ, repr(text)
        elif typ in SP or (typ in NL and not open_cell):
            open_quote_next = True
            if macroname is not None and arglist is None:
                # First space after a macro name starts its argument list.
                arglist = ['']
                arglistquote = None
                arglistnest = []
                continue
            elif eatspaces:
                continue
            elif arglist is not None and not arglistquote and not arglistnest:
                # Unquoted, unnested space separates macro arguments.
                arglist.append('')
                continue
            elif linkdest is None and arglist is None and macroname is None:
                # Weird space cases.
                if (len(text_bits) > 0 and text_bits[-1] == u'\u2013' and
                        (len(text_bits) > 1 or typ not in NL)):
                    # en dash followed by whitespace becomes an em dash.
                    if debug:
                        print >>sys.stderr, 'Weird space case'
                    text_bits[-1] = u'\u2014'
                    continue
            text = toktext
        elif typ == cl.Token.Escape:
            txt = toktext
            assert len(txt) == 2, repr(txt)
            if (txt[1].isalnum() and
                    (linkdest is not None or implicit_link)):
                # Inside a link target, keep the escape verbatim.
                text = txt
            elif txt[1].isdigit():
                # ~<digit> renders as a tilde operator plus the digit.
                text = TILDE_OP + txt[1]
            elif txt[1].isalpha():
                text = txt
            else:
                # Escaped punctuation: drop the escape character.
                text = txt[1]
        elif typ == cl.Token.EscapedImplicitInfix:
            text = toktext[1:]
        elif typ == cl.Token.LinkImplicitInfix:
            if text_bits and text_bits[-1] in ('http', 'https'):
                # "http" + "://" begins a bare URL; flush preceding text.
                implicit_link = text_bits.pop()
                if text_bits:
                    yield Text(''.join(text_bits))
                    text_bits = []
                continue
            else:
                text = toktext
        else:
            eatspaces = False
            last_quote = None
            # Not a text node or something infix, so flush text.
            if typ in SKIP_WHITESPACE_BEFORE:
                while text_bits and not text_bits[-1].strip():
                    text_bits.pop()
            if text_bits:
                yield Text(''.join(text_bits))
                text_bits = []
            text = None
            # End of line (or the row's macro close) closes any open cell
            # and its row.  `stack and` guards the stack[-1] peek against
            # an empty stack (previously a possible IndexError).
            if ((typ in NL + (cl.Token.EOF, cl.Token.ParaBreak) or
                    (typ == cl.Token.Macro.Start.Close and
                     stack and stack[-1][0] == TABLE_ROW)) and open_cell):
                if open_cell is not True:
                    yield End(open_cell)
                s, a = stack.pop()
                assert s == TABLE_ROW, (s, a, stack)
                yield End(TABLE_ROW)
                open_cell = None
            if typ in NL:
                continue
            if typ in FLUSH_STACK_BEFORE:
                # If an environment ender, flush stack.
                #print >>sys.stderr, 'flushing', stack, 'before', typ
                while stack:
                    if stack[-1][0] == MACRO:
                        break
                    esty, earg = stack.pop()
                    #yield End(esty, earg)
            if typ == cl.Token.EOF:
                assert macroname is None, macroname
                break
            elif typ == cl.Token.ParaBreak:
                yield Entity(ENV_BREAK)
                open_quote_next = True
            elif typ in TOGGLE_MAP:
                sty = TOGGLE_MAP[typ]
                if stack and stack[-1][0] == sty:
                    yield End(sty, stack[-1][1])
                    stack.pop()
                else:
                    stack.append([sty, None])
                    yield Start(sty)
            elif typ in OPEN_MAP:
                sty = OPEN_MAP[typ]
                stack.append([sty, None])
                yield Start(sty)
            elif typ == cl.Token.Heading:
                # Heading level is the number of '=' characters.
                lev = len(toktext.strip())
                stack.append([HEADING, lev])
                yield Start(HEADING, lev)
                eatspaces = True
                open_quote_next = True
            elif typ in CLOSE_MAP:
                sty = CLOSE_MAP[typ]
                if stack and stack[-1][0] == sty:
                    yield End(sty, stack[-1][1])
                    stack.pop()
                else:
                    if debug:
                        print >>sys.stderr, 'error!', sty, stack
                    yield gen_error(toktext)
            elif typ in ENTITY_MAP:
                yield Entity(ENTITY_MAP[typ])
            elif typ in LIST_MAP:
                item = LIST_MAP[typ]
                # Nesting depth is the length of the bullet/number marker.
                depth = len(toktext.strip())
                if item is not None:
                    stack.append([item, depth])
                    yield Start(item, depth)
                eatspaces = True
            elif typ in TABLE_MAP:
                if open_cell:
                    yield End(open_cell)
                else:
                    stack.append([TABLE_ROW, None])
                    yield Start(TABLE_ROW)
                cell = TABLE_MAP[typ]
                # BUG FIX: was `cl.Token.TableCell`, an auto-created token
                # distinct from `cl.Token.Table.Cell` (the spelling used
                # everywhere else in this module), so the comparison never
                # matched and plain cells were never delayed.
                if typ == cl.Token.Table.Cell:
                    # Delay because we drop it at end of line
                    delayed_token = cell
                    open_cell = True
                else:
                    yield Start(cell)
                    open_cell = cell
                eatspaces = True
            elif typ in LINK_START_MAP:
                stack.append([LINK_START_MAP[typ], None])
                linkdest = ''
            elif typ == cl.Token.LinkPipe:
                # The pipe ends the destination and begins the label.
                assert linkdest is not None
                yield Start(stack[-1][0], linkdest)
                stack[-1][1] = linkdest
                linkdest = None
            elif typ in LINK_END_MAP:
                sty = LINK_END_MAP[typ]
                if stack and stack[-1][0] == sty:
                    if linkdest is None:
                        # pipe already hit
                        yield End(sty, stack[-1][1])
                    else:
                        # No label: the whole link is a single entity.
                        yield Entity(sty, linkdest)
                        linkdest = None
                    stack.pop()
                else:
                    if debug:
                        print >>sys.stderr, 'lem error!', sty, stack
                    yield gen_error(toktext)
            elif typ == cl.Token.Macro.Start.Open:
                macroname = ''
                macroend = False
            elif typ == cl.Token.Macro.Start.Close:
                macroname = ''
                macroend = True
            elif typ == cl.Token.Macro.End.Inline:
                if macroend:
                    assert not arglist, (macroname, arglist)
                    if stack and stack[-1] == [MACRO, macroname]:
                        yield End(MACRO, macroname)
                        stack.pop()
                    else:
                        if debug:
                            print >>sys.stderr, 'merror!', stack
                        # BUG FIX: was `'<>' % macroname`, which raised
                        # TypeError ('<>' has no conversion specifier);
                        # report the offending close tag instead.
                        yield gen_error('<</%s>>' % macroname)
                else:
                    stack.append([MACRO, macroname])
                    assert macroname is not None
                    yield Start(MACRO, (macroname, arglist))
                macroname = None
                arglist = None
            elif typ in (cl.Token.Macro.End.Entity, cl.Token.Macro.End.Broken):
                yield Entity(MACRO, (macroname, arglist))
                macroname = None
                arglist = None
                open_quote_next = False
            elif typ in (cl.Token.Macro.TeX.Open,
                         cl.Token.Macro.TeX.ContentOpen):
                assert macroname is None
                if typ == cl.Token.Macro.TeX.Open:
                    # "\name{" form.
                    assert toktext.startswith('\\') and toktext.endswith('{')
                    mn = toktext[1:-1]
                else:
                    # "{\name " or "{\name" form.
                    assert toktext.startswith('{\\')
                    if toktext.endswith(' '):
                        mn = toktext[2:-1]
                    else:
                        mn = toktext[2:]
                stack.append([MACRO, mn])
                yield Start(MACRO, (mn, None))
                open_quote_next = True
            elif typ == cl.Token.Macro.TeX.OptArgOpen:
                # "\name[" — arguments accumulate until EndOptArgs/Close.
                assert macroname is None
                assert toktext.startswith('\\') and toktext.endswith('[')
                macroname = toktext[1:-1]
                arglist = ['']
                arglistquote = True
                arglistnest = []
            elif typ == cl.Token.Macro.TeX.OptArgBreak:
                arglist.append('')
            elif typ == cl.Token.Macro.TeX.EndOptArgs:
                stack.append([MACRO, macroname])
                yield Start(MACRO, (macroname, arglist))
                macroname = None
                arglist = None
            elif typ == cl.Token.Macro.TeX.OptArgClose:
                yield Entity(MACRO, (macroname, arglist))
                macroname = None
                arglist = None
                open_quote_next = False
            elif typ == cl.Token.Macro.TeX.Entity:
                assert toktext.startswith('\\')
                if toktext.endswith(' '):
                    entity = Entity(MACRO, (toktext[1:-1], None))
                else:
                    entity = Entity(MACRO, (toktext[1:], None))
                if macroname is not None:
                    # \foo[\bar] case
                    arglist.append(entity)
                else:
                    yield entity
                open_quote_next = False
            elif typ == cl.Token.Macro.TeX.SingleChar:
                # "\Xc": one-character macro X applied to one character c.
                assert macroname is None
                assert arglist is None
                assert len(toktext) == 3 and toktext.startswith('\\')
                assert toktext[1] != '\n', markup
                yield Start(MACRO, (toktext[1], None))
                yield Text(toktext[2])
                yield End(MACRO, toktext[1])
                open_quote_next = False
            elif typ == cl.Token.Macro.TeX.Close:
                if debug:
                    print >>sys.stderr, "TEX_MACRO_END", stack
                if stack and stack[-1][0] == MACRO:
                    sty, name = stack.pop()
                    yield End(MACRO, name)
                else:
                    yield gen_error(toktext)
                open_quote_next = False
            elif typ == cl.Token.Macro.TeX.ArgBreak:
                open_quote_next = True
                yield Entity(MACRO, ('break', None))
            else:
                assert typ in IGNORE, (typ, toktext)
            # Structural tokens never fall through to text handling.
            continue

        assert text is not None
        # Text node handling.
        if eatspacesnext:
            eatspaces = True
            eatspacesnext = False
        else:
            eatspaces = False
        if linkdest is not None:
            linkdest += text
        elif arglist is not None:
            if arglistquote is True:
                # TeX-style
                pass
            elif arglistnest and text == arglistnest[-1]:
                arglistnest.pop()
            elif text in GROUPING:
                arglistnest.append(GROUPING[text])
            elif arglistquote:
                if text == arglistquote:
                    arglistquote = None
            elif text in QUOTES:
                arglistquote = text
            arglist[-1] += text
        elif macroname is not None:
            macroname += text
        else:
            text_bits.append(text)
            if debug:
                print >>sys.stderr, text_bits