Line data Source code
1 : /* 2 : Mosh: the mobile shell 3 : Copyright 2012 Keith Winstein 4 : 5 : This program is free software: you can redistribute it and/or modify 6 : it under the terms of the GNU General Public License as published by 7 : the Free Software Foundation, either version 3 of the License, or 8 : (at your option) any later version. 9 : 10 : This program is distributed in the hope that it will be useful, 11 : but WITHOUT ANY WARRANTY; without even the implied warranty of 12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 : GNU General Public License for more details. 14 : 15 : You should have received a copy of the GNU General Public License 16 : along with this program. If not, see <http://www.gnu.org/licenses/>. 17 : 18 : In addition, as a special exception, the copyright holders give 19 : permission to link the code of portions of this program with the 20 : OpenSSL library under certain conditions as described in each 21 : individual source file, and distribute linked combinations including 22 : the two. 23 : 24 : You must obey the GNU General Public License in all respects for all 25 : of the code used other than OpenSSL. If you modify file(s) with this 26 : exception, you may extend this exception to your version of the 27 : file(s), but you are not obligated to do so. If you do not wish to do 28 : so, delete this exception statement from your version. If you delete 29 : this exception statement from all source files in the program, then 30 : also delete it here. 31 : */ 32 : 33 : #include <assert.h> 34 : #include <typeinfo> 35 : #include <errno.h> 36 : #include <wchar.h> 37 : #include <stdint.h> 38 : 39 : #include "parser.h" 40 : 41 : const Parser::StateFamily Parser::family; 42 : 43 72419091 : static void append_or_delete( Parser::ActionPointer act, 44 : Parser::Actions &vec ) 45 : { 46 72419091 : assert( act ); 47 : 48 72419091 : if ( !act->ignore() ) { 49 72189527 : vec.push_back( act ); 50 : } 51 72419091 : } 52 : 53 72189527 : void Parser::Parser::input( wchar_t ch, Actions &ret ) 54 : { 55 72189527 : Transition tx = state->input( ch ); 56 : 57 72189527 : if ( tx.next_state != NULL ) { 58 229564 : append_or_delete( state->exit(), ret ); 59 : } 60 : 61 144379054 : append_or_delete( tx.action, ret ); 62 : 63 72189527 : if ( tx.next_state != NULL ) { 64 114782 : append_or_delete( tx.next_state->enter(), ret ); 65 114782 : state = tx.next_state; 66 : } 67 72189527 : } 68 : 69 12102 : Parser::UTF8Parser::UTF8Parser() 70 12102 : : parser(), buf_len( 0 ) 71 : { 72 12102 : assert( BUF_SIZE >= (size_t)MB_CUR_MAX ); 73 12102 : buf[0] = '\0'; 74 12102 : } 75 : 76 72190721 : void Parser::UTF8Parser::input( char c, Actions &ret ) 77 : { 78 72190721 : assert( buf_len < BUF_SIZE ); 79 : 80 : /* 1-byte UTF-8 character, aka ASCII? Cheat. */ 81 72190721 : if ( buf_len == 0 && static_cast<unsigned char>(c) <= 0x7f ) { 82 72188333 : parser.input( static_cast<wchar_t>(c), ret ); 83 72188333 : return; 84 : } 85 : 86 2388 : buf[ buf_len++ ] = c; 87 : 88 : /* This function will only work in a UTF-8 locale. */ 89 2388 : wchar_t pwc; 90 2388 : mbstate_t ps = mbstate_t(); 91 : 92 2388 : size_t total_bytes_parsed = 0; 93 2388 : size_t orig_buf_len = buf_len; 94 : 95 : /* this routine is somewhat complicated in order to comply with 96 : Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */ 97 : 98 4776 : while ( total_bytes_parsed != orig_buf_len ) { 99 2388 : assert( total_bytes_parsed < orig_buf_len ); 100 2388 : assert( buf_len > 0 ); 101 2388 : size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps ); 102 : 103 : /* this returns 0 when n = 0! */ 104 : 105 2388 : if ( bytes_parsed == 0 ) { 106 : /* character was NUL, accept and clear buffer */ 107 0 : assert( buf_len == 1 ); 108 0 : buf_len = 0; 109 0 : pwc = L'\0'; 110 0 : bytes_parsed = 1; 111 2388 : } else if ( bytes_parsed == (size_t) -1 ) { 112 : /* invalid sequence, use replacement character and try again with last char */ 113 0 : assert( errno == EILSEQ ); 114 0 : if ( buf_len > 1 ) { 115 0 : buf[ 0 ] = buf[ buf_len - 1 ]; 116 0 : bytes_parsed = buf_len - 1; 117 0 : buf_len = 1; 118 : } else { 119 0 : buf_len = 0; 120 0 : bytes_parsed = 1; 121 : } 122 0 : pwc = (wchar_t) 0xFFFD; 123 2388 : } else if ( bytes_parsed == (size_t) -2 ) { 124 : /* can't parse incomplete multibyte character */ 125 1194 : total_bytes_parsed += buf_len; 126 1194 : continue; 127 : } else { 128 : /* parsed into pwc, accept */ 129 1194 : assert( bytes_parsed <= buf_len ); 130 1194 : memmove( buf, buf + bytes_parsed, buf_len - bytes_parsed ); 131 1194 : buf_len = buf_len - bytes_parsed; 132 : } 133 : 134 : /* Cast to unsigned for checks, because some 135 : platforms (e.g. ARM) use uint32_t as wchar_t, 136 : causing compiler warning on "pwc > 0" check. */ 137 1194 : const uint32_t pwcheck = pwc; 138 : 139 1194 : if ( pwcheck > 0x10FFFF ) { /* outside Unicode range */ 140 0 : pwc = (wchar_t) 0xFFFD; 141 : } 142 : 143 1194 : if ( (pwcheck >= 0xD800) && (pwcheck <= 0xDFFF) ) { /* surrogate code point */ 144 : /* 145 : OS X unfortunately allows these sequences without EILSEQ, but 146 : they are ill-formed UTF-8 and we shouldn't repeat them to the 147 : user's terminal. 148 : */ 149 0 : pwc = (wchar_t) 0xFFFD; 150 : } 151 : 152 1194 : parser.input( pwc, ret ); 153 : 154 1194 : total_bytes_parsed += bytes_parsed; 155 : } 156 : } 157 : 158 20841 : Parser::Parser::Parser( const Parser &other ) 159 20841 : : state( other.state ) 160 20841 : {} 161 : 162 23153 : Parser::Parser & Parser::Parser::operator=( const Parser &other ) 163 : { 164 23153 : state = other.state; 165 23153 : return *this; 166 : }