import sys
import antlr3
import Creole
from tokens import *

# Lexer token type -> inline style that is opened and closed by the same
# markup (the first occurrence opens, the next matching one closes).
TOGGLE_MAP = {
    Creole.BOLD: BOLD,
    Creole.ITALIC: ITALIC,
    Creole.MONOSPACE: MONOSPACE,
    Creole.SUPERSCRIPT: SUPERSCRIPT,
    Creole.SUBSCRIPT: SUBSCRIPT,
    Creole.UNDERLINE: UNDERLINE,
    #Creole.STRIKE: STRIKE,
}

# Styles that have distinct opening and closing lexer tokens.
OPEN_MAP = {Creole.OPEN_CODEBLOCK: CODEBLOCK,
            Creole.OPEN_CODE: MONOSPACE}
CLOSE_MAP = {Creole.CLOSE_CODEBLOCK: CODEBLOCK,
             Creole.CLOSE_CODE: MONOSPACE}

# Lexer tokens that become a stand-alone Entity with no content.
ENTITY_MAP = {Creole.HRULE: HRULE,
              Creole.LINEBREAK: LINEBREAK}

LINK_START_MAP = {Creole.LINK_START: LINK,
                  Creole.IMAGE_START: IMAGE}
LINK_END_MAP = {Creole.LINK_END: LINK,
                Creole.IMAGE_END: IMAGE}

# Line-leading markers; the stripped length of the marker text is used as the
# nesting depth.
LIST_MAP = {Creole.UNORDERED_LIST_START: UNORDERED_ITEM,
            Creole.UNORDERED_LIST: UNORDERED_ITEM,
            Creole.ORDERED_LIST_START: ORDERED_ITEM,
            Creole.ORDERED_LIST: ORDERED_ITEM,
            Creole.BLOCKQUOTE: BLOCKQUOTE_LINE}

TABLE_MAP = {Creole.TABLE_HEADING: TABLE_HEADING,
             Creole.TABLE_CELL: TABLE_CELL}

# Lexer tokens that are replaced by a single typographic character.
PUNCT_MAP = {
    Creole.EM_DASH: u'\u2014',
    Creole.EN_DASH: u'\u2013',
    Creole.OPEN_QUOTE: u'\u201C',
    Creole.CLOSE_QUOTE: u'\u201D',
    Creole.OPEN_SINGLE_QUOTE: u'\u2018',
    Creole.CLOSE_SINGLE_QUOTE: u'\u2019',
    Creole.ELLIPSIS: u'\u2026',
    Creole.DOUBLE_BOTH: u'\u21D4',
    Creole.SINGLE_BOTH: u'\u2194',
    Creole.DOUBLE_RIGHT: u'\u21D2',
    Creole.SINGLE_RIGHT: u'\u2192',
    Creole.DOUBLE_LEFT: u'\u21D0',
    Creole.SINGLE_LEFT: u'\u2190',
}

# Lexer tokens whose text is passed through verbatim.
TEXT = set((Creole.TEXT, Creole.PUNCT,
            Creole.MACRO_QUOTE, Creole.MACRO_END_QUOTE))

# Tokens before which every open environment (up to the innermost macro) is
# closed.  list(d) is used instead of d.keys() + [...] so the concatenation
# also works where keys() returns a view rather than a list.
FLUSH_STACK_BEFORE = set(list(LIST_MAP) +
                         [Creole.EOF, Creole.PARABREAK, Creole.HEADING,
                          Creole.CLOSE_MACRO_TAG_START])

# Tokens before which trailing whitespace-only text is discarded.
SKIP_WHITESPACE_BEFORE = set(list(LIST_MAP) + list(TABLE_MAP) +
                             [Creole.HEADING, Creole.EOF])


def tokenize(markup):
    """Lex *markup* with the Creole lexer and yield a flat event stream.

    This is a generator.  It pulls tokens from ``Creole.Creole`` one at a
    time and yields ``Text``, ``Start``, ``End``, ``Entity`` and ``Error``
    objects (constructors expected to come from ``tokens``).  Consecutive
    text-like tokens are buffered and emitted as a single ``Text`` when a
    structural token is reached.  Open environments are tracked on a stack of
    ``[style, argument]`` pairs.

    Iteration ends after the lexer returns ``Creole.EOF``.  An unrecognised
    token type trips an ``AssertionError``.
    """
    char_stream = antlr3.ANTLRStringStream(markup)
    lexer = Creole.Creole(char_stream)

    text_bits = []          # pending text fragments, joined when flushed
    stack = []              # open environments as [style, argument]
    linkdest = None         # link target being collected, else None
    macroname = None        # macro name being collected, else None
    argstr = None           # macro argument string being collected, else None
    eatspaces = False       # drop whitespace until the next text token
    implicit_link = False   # scheme ('http'/'https') of a bare URL in progress
    open_cell = None        # open table cell style; True while still delayed
    # Delayed until we see something other than eaten spaces, dropped if
    # that's a newline or EOF
    delayed_token = None
    seen_open_single = False

    while True:
        t = lexer.nextToken()
        typ = t.getType()

        # A heading runs to the end of its line: strip trailing whitespace
        # and closing '=' markers, flush the text and close the heading.
        if (stack and stack[-1][0] == HEADING and
                typ in (Creole.NL, Creole.PARABREAK, Creole.EOF)):
            while text_bits and not text_bits[-1].strip():
                text_bits.pop()
            while text_bits and text_bits[-1] == '=':
                text_bits.pop()
            while text_bits and not text_bits[-1].strip():
                text_bits.pop()
            if text_bits:
                yield Text(''.join(text_bits))
                text_bits = []
            yield End(HEADING, stack[-1][1])
            stack.pop()

        # A bare URL ends at the first token that is neither text nor an
        # escape.
        if implicit_link and typ not in TEXT and typ != Creole.ESCAPE:
            path = ''.join(text_bits)
            text_bits = []
            # A trailing non-alphanumeric character is kept as ordinary text
            # rather than as part of the URL.  The emptiness check avoids an
            # IndexError when nothing followed the '://'.
            if path and not path[-1].isalnum():
                text_bits.append(path[-1])
                path = path[:-1]
            yield Entity(LINK, implicit_link + '://' + path)
            implicit_link = False
            # NOTE(review): the token that terminated the link is not
            # processed any further; the loop moves on to the next token.
            # Confirm this is intended.
            continue

        # A delayed table cell is started by the first non-space token,
        # unless that token ends the line, in which case it is dropped.
        if delayed_token is not None and typ != Creole.SP:
            if typ != Creole.NL and typ != Creole.EOF:
                yield Start(delayed_token)
                open_cell = delayed_token
            delayed_token = None

        if typ in TEXT:
            text = t.getText()
        elif typ == Creole.CLOSE_SINGLE_QUOTE and not seen_open_single:
            # Hack: with no opening single quote seen so far, keep the
            # token's literal text instead of the typographic closing quote
            # (presumably so apostrophes are left alone).
            text = t.getText()
        elif typ in PUNCT_MAP:
            if typ == Creole.OPEN_SINGLE_QUOTE:
                seen_open_single = True
            text = PUNCT_MAP[typ]
        elif typ == Creole.SP or (typ == Creole.NL and not open_cell):
            if macroname is not None and argstr is None:
                # First whitespace after a macro name: what follows is
                # collected as the argument string.
                argstr = ''
                continue
            elif eatspaces:
                continue
            else:
                text = t.getText()
        elif typ == Creole.ESCAPE:
            txt = t.getText()
            assert len(txt) == 2
            # Inside a link target an escaped alphanumeric keeps its escape
            # character; everywhere else only the escaped character is kept.
            if (txt[1].isalnum() and
                    (linkdest is not None or implicit_link)):
                text = txt
            else:
                text = txt[1]
        elif typ == Creole.LINK_IMPLICIT_INFIX:
            if text_bits and text_bits[-1] in ('http', 'https'):
                # Start of a bare URL: remember the scheme and flush whatever
                # text preceded it.
                implicit_link = text_bits.pop()
                if text_bits:
                    yield Text(''.join(text_bits))
                    text_bits = []
                continue
            else:
                text = t.getText()
        else:
            eatspaces = False
            # Not a text node or something infix, so flush text.
            if typ in SKIP_WHITESPACE_BEFORE:
                while text_bits and not text_bits[-1].strip():
                    text_bits.pop()
            if text_bits:
                yield Text(''.join(text_bits))
                text_bits = []
            text = None

            # End of line (or a close-macro tag directly inside a row) closes
            # the open table cell and its row.  open_cell and stack are
            # tested before stack[-1] so a stray close-macro tag with nothing
            # open cannot raise IndexError.
            if open_cell and (
                    typ in (Creole.NL, Creole.EOF) or
                    (typ == Creole.CLOSE_MACRO_TAG_START and
                     stack and stack[-1][0] == TABLE_ROW)):
                if open_cell is not True:
                    # True means the cell was still delayed and never started.
                    yield End(open_cell)
                s, a = stack.pop()
                assert s == TABLE_ROW
                yield End(TABLE_ROW)
                open_cell = None
                if typ == Creole.NL:
                    continue

            if typ in FLUSH_STACK_BEFORE:
                # If an environment ender, flush stack (but never past the
                # innermost open macro).
                while stack:
                    if stack[-1][0] == MACRO:
                        break
                    esty, earg = stack.pop()
                    yield End(esty, earg)

            if typ == Creole.EOF:
                break
            elif typ == Creole.PARABREAK:
                yield Entity(ENV_BREAK)
            elif typ in TOGGLE_MAP:
                sty = TOGGLE_MAP[typ]
                if stack and stack[-1][0] == sty:
                    yield End(sty, stack[-1][1])
                    stack.pop()
                else:
                    stack.append([sty, None])
                    yield Start(sty)
            elif typ in OPEN_MAP:
                sty = OPEN_MAP[typ]
                stack.append([sty, None])
                yield Start(sty)
            elif typ == Creole.HEADING:
                lev = len(t.getText().strip())
                stack.append([HEADING, lev])
                yield Start(HEADING, lev)
                eatspaces = True
            elif typ in CLOSE_MAP:
                sty = CLOSE_MAP[typ]
                if stack and stack[-1][0] == sty:
                    yield End(sty, stack[-1][1])
                    stack.pop()
                else:
                    yield Error(t.getText())
            elif typ in ENTITY_MAP:
                yield Entity(ENTITY_MAP[typ])
            elif typ in LIST_MAP:
                item = LIST_MAP[typ]
                depth = len(t.getText().strip())
                if item is not None:
                    stack.append([item, depth])
                    yield Start(item, depth)
                eatspaces = True
            elif typ in TABLE_MAP:
                if open_cell:
                    yield End(open_cell)
                else:
                    stack.append([TABLE_ROW, None])
                    yield Start(TABLE_ROW)
                cell = TABLE_MAP[typ]
                if typ == Creole.TABLE_CELL:
                    # Delay because we drop it at end of line
                    delayed_token = cell
                    open_cell = True
                else:
                    yield Start(cell)
                    open_cell = cell
                eatspaces = True
            elif typ in LINK_START_MAP:
                stack.append([LINK_START_MAP[typ], None])
                linkdest = ''
            elif typ == Creole.LINK_PIPE:
                assert linkdest is not None
                # Everything collected so far was the target; what follows is
                # the link's content.
                yield Start(stack[-1][0], linkdest)
                stack[-1][1] = linkdest
                linkdest = None
            elif typ in LINK_END_MAP:
                sty = LINK_END_MAP[typ]
                if stack and stack[-1][0] == sty:
                    if linkdest is None:
                        # pipe already hit
                        yield End(sty, stack[-1][1])
                    else:
                        yield Entity(sty, linkdest)
                        linkdest = None
                    stack.pop()
                else:
                    yield Error(t.getText())
            elif typ == Creole.OPEN_MACRO_TAG_START:
                macroname = ''
                macroend = False
            elif typ == Creole.CLOSE_MACRO_TAG_START:
                macroname = ''
                macroend = True
            elif typ == Creole.INLINE_MACRO_TAG_END:
                if macroend:
                    assert not argstr
                    if stack and stack[-1] == [MACRO, macroname]:
                        yield End(MACRO, macroname)
                        stack.pop()
                    else:
                        # Report the unmatched close tag.  The previous
                        # format string had no conversion specifier, so the
                        # % operator raised TypeError here.
                        # NOTE(review): assumes close tags are written
                        # <</name>> -- confirm against the grammar.
                        yield Error('<</%s>>' % macroname)
                else:
                    stack.append([MACRO, macroname])
                    yield Start(MACRO, (macroname, argstr))
                macroname = None
                argstr = None
            elif typ == Creole.ENTITY_MACRO_TAG_END:
                yield Entity(MACRO, (macroname, argstr))
                macroname = None
                argstr = None
            elif typ == Creole.TEX_MACRO_START:
                macroname = ''
            elif typ == Creole.TEX_MACRO_START_CONTENT:
                stack.append([MACRO, macroname])
                yield Start(MACRO, (macroname, None))
                macroname = None
            elif typ == Creole.TEX_MACRO_END:
                if stack and stack[-1][0] == MACRO:
                    sty, name = stack.pop()
                    yield End(MACRO, name)
                else:
                    yield Error(t.getText())
            else:
                assert False, (typ, t.getText())
            continue

        assert text is not None
        # Text node handling: route the text to whichever collector is
        # currently active, falling back to the ordinary text buffer.
        eatspaces = False
        if linkdest is not None:
            linkdest += text
        elif argstr is not None:
            argstr += text
        elif macroname is not None:
            macroname += text
        else:
            text_bits.append(text)