| 1 | o (Revision: 05d3cd9ed57a8fbd38f78f044702aa7f277d8054) |
|---|
| 2 | |\ |
|---|
| 3 | | o ----------------------------------------------------------------- |
|---|
| 4 | | | Revision: 7088cb4c44ba3bb518469b84a708a10dfe4dfd6f |
|---|
| 5 | | | Ancestor: ef71051efe4a5691a68b07480ccd7af2a6c83abb |
|---|
| 6 | | | Author: graydon@pobox.com |
|---|
| 7 | | | Date: 2007-05-19T01:11:05 |
|---|
| 8 | | | Branch: com.mozilla.es4.smlnj |
|---|
| 9 | | | |
|---|
| 10 | | | Modified files: |
|---|
| 11 | | | builtins/Conversions.es builtins/RegExpCompiler.es eval.sml |
|---|
| 12 | | | |
|---|
| 13 | | | ChangeLog: |
|---|
| 14 | | | |
|---|
| 15 | | | fixes to regexp and comparison algorithm |
|---|
| 16 | | | |
|---|
| 17 | | | ============================================================ |
|---|
| 18 | | | --- builtins/RegExpCompiler.es dffbf10464ef5f5c9ff24d0dce7ad25955b52eea |
|---|
| 19 | | | +++ builtins/RegExpCompiler.es 0290d0d8137031ae7a7770fb7104305edf414c5e |
|---|
| 20 | | | @@ -260,8 +260,8 @@ package RegExpInternals |
|---|
| 21 | | | case 0x3Fu /* "?" */: |
|---|
| 22 | | | case 0x7Bu /* "{" */: |
|---|
| 23 | | | case 0x7Cu /* "|" */: |
|---|
| 24 | | | - case 0x7Du /* "}" */: |
|---|
| 25 | | | - case 0x5Du /* "]" */: |
|---|
| 26 | | | + // case 0x7Du /* "}" */: |
|---|
| 27 | | | + // case 0x5Du /* "]" */: |
|---|
| 28 | | | return null; |
|---|
| 29 | | | |
|---|
| 30 | | | default: |
|---|
| 31 | | | @@ -473,7 +473,7 @@ package RegExpInternals |
|---|
| 32 | | | switch (peekCharCode()) { |
|---|
| 33 | | | |
|---|
| 34 | | | case 0x64u /* "d" */: advance(); return charset_digit; |
|---|
| 35 | | | - case 0x44u /* "D" */: advance(); return charset_nondigit; |
|---|
| 36 | | | + case 0x44u /* "D" */: advance(); return charset_notdigit; |
|---|
| 37 | | | case 0x73u /* "s" */: advance(); return charset_space; |
|---|
| 38 | | | case 0x53u /* "S" */: advance(); return charset_notspace; |
|---|
| 39 | | | case 0x77u /* "w" */: advance(); return charset_word; |
|---|
| 40 | | o (Revision: ef71051efe4a5691a68b07480ccd7af2a6c83abb) |
|---|
| 41 | |/| |
|---|
| 42 | | o ----------------------------------------------------------------- |
|---|
| 43 | | | Revision: ea2fb5cd1ba12ce935736823136f5e03b80e4131 |
|---|
| 44 | | | Ancestor: db5745bc1bcb0c59dbbb4d33a5cafa266519d5ca |
|---|
| 45 | | | Author: graydon@pobox.com |
|---|
| 46 | | | Date: 2007-05-18T22:11:14 |
|---|
| 47 | | | Branch: com.mozilla.es4.smlnj |
|---|
| 48 | | | |
|---|
| 49 | | | Modified files: |
|---|
| 50 | | | boot.sml builtins/RegExpCompiler.es builtins/Unicode.es |
|---|
| 51 | | | eval.sml |
|---|
| 52 | | | |
|---|
| 53 | | | ChangeLog: |
|---|
| 54 | | | |
|---|
| 55 | | | optimize regexp compiler, expand caches |
|---|
| 56 | | | |
|---|
| 57 | | | ============================================================ |
|---|
| 58 | | | --- builtins/RegExpCompiler.es 23de98402dc1ccd7cc1fc5d78102ac636e1d86f2 |
|---|
| 59 | | | +++ builtins/RegExpCompiler.es dffbf10464ef5f5c9ff24d0dce7ad25955b52eea |
|---|
| 60 | | | @@ -22,6 +22,7 @@ package RegExpInternals |
|---|
| 61 | | | /* Invariant for token handling: either idx==source.length or source[idx] is a significant char */ |
|---|
| 62 | | | |
|---|
| 63 | | | var source : string; // expression source, sans leading and trailing / // FIXME: const |
|---|
| 64 | | | + var slen : uint; // source length, retrieved once |
|---|
| 65 | | | var idx : uint; // current character in the source |
|---|
| 66 | | | var largest_backref : uint; // largest back reference seen |
|---|
| 67 | | | var extended : boolean; // true iff expression has /x flag // FIXME: const |
|---|
| 68 | | | @@ -32,6 +33,7 @@ package RegExpInternals |
|---|
| 69 | | | function RegExpCompiler( source : string, flags ) |
|---|
| 70 | | | : extended = flags.x |
|---|
| 71 | | | , source = source |
|---|
| 72 | | | + , slen = source.length |
|---|
| 73 | | | , idx = 0 /* FIXME: redundant */ |
|---|
| 74 | | | , largest_backref = 0 /* FIXME: redundant */ |
|---|
| 75 | | | { |
|---|
| 76 | | | @@ -40,7 +42,7 @@ package RegExpInternals |
|---|
| 77 | | | |
|---|
| 78 | | | public function compile() : RegExpMatcher { |
|---|
| 79 | | | let p : Matcher = pattern(); |
|---|
| 80 | | | - if (idx !== source.length) |
|---|
| 81 | | | + if (idx !== slen) |
|---|
| 82 | | | fail( SyntaxError, "Invalid character in input \"" + source + "\", position " + idx ); |
|---|
| 83 | | | if (largest_backref > parenIndex && largest_backref > 0) |
|---|
| 84 | | | fail( SyntaxError, "Reference to undefined capture " + largest_backref ); |
|---|
| 85 | | | @@ -55,9 +57,10 @@ package RegExpInternals |
|---|
| 86 | | | let alt : Matcher = alternative(); |
|---|
| 87 | | | if (alt == null) |
|---|
| 88 | | | return new Empty; |
|---|
| 89 | | | - if (eat("|")) |
|---|
| 90 | | | + if (peekCharCode() == 0x7Cu /* "|" */) { |
|---|
| 91 | | | + advance(); |
|---|
| 92 | | | return new Disjunct(alt, disjunction()); |
|---|
| 93 | | | - else |
|---|
| 94 | | | + } else |
|---|
| 95 | | | return alt; |
|---|
| 96 | | | } |
|---|
| 97 | | | |
|---|
| 98 | | | @@ -86,11 +89,23 @@ package RegExpInternals |
|---|
| 99 | | | } |
|---|
| 100 | | | |
|---|
| 101 | | | function assertion() : Matcher? { |
|---|
| 102 | | | - if (eat("^")) return new AssertStartOfInput; |
|---|
| 103 | | | - else if (eat("$")) return new AssertEndOfInput; |
|---|
| 104 | | | - else if (eat("\\b")) return new AssertWordBoundary; |
|---|
| 105 | | | - else if (eat("\\B")) return new AssertNotWordBoundary; |
|---|
| 106 | | | - else return null; |
|---|
| 107 | | | + switch (peekCharCode()) { |
|---|
| 108 | | | + |
|---|
| 109 | | | + case 0x5Eu /* "^" */ : |
|---|
| 110 | | | + advance(); |
|---|
| 111 | | | + return new AssertStartOfInput; |
|---|
| 112 | | | + |
|---|
| 113 | | | + case 0x24u /* "$" */ : |
|---|
| 114 | | | + advance(); |
|---|
| 115 | | | + return new AssertEndOfInput; |
|---|
| 116 | | | + |
|---|
| 117 | | | + case 0x5Cu /* "\\" */: |
|---|
| 118 | | | + if (eat("\\b")) return new AssertWordBoundary; |
|---|
| 119 | | | + else if (eat("\\B")) return new AssertNotWordBoundary; |
|---|
| 120 | | | + |
|---|
| 121 | | | + default: |
|---|
| 122 | | | + return null; |
|---|
| 123 | | | + } |
|---|
| 124 | | | } |
|---|
| 125 | | | |
|---|
| 126 | | | function quantifier() : [double,double,boolean]? { |
|---|
| 127 | | | @@ -98,127 +113,160 @@ package RegExpInternals |
|---|
| 128 | | | if (x == null) |
|---|
| 129 | | | return x; |
|---|
| 130 | | | let [min,max] : [double,double] = x; |
|---|
| 131 | | | - let greedy : boolean = !eat("?"); |
|---|
| 132 | | | + let greedy : boolean = true; |
|---|
| 133 | | | + if (peekCharCode() == 0x3Fu /* "?" */) { |
|---|
| 134 | | | + greedy = false; |
|---|
| 135 | | | + advance(); |
|---|
| 136 | | | + } |
|---|
| 137 | | | return [min,max,greedy]; |
|---|
| 138 | | | } |
|---|
| 139 | | | |
|---|
| 140 | | | + static const star = [0,Infinity]; |
|---|
| 141 | | | + static const plus = [1,Infinity]; |
|---|
| 142 | | | + static const ques = [0,1]; |
|---|
| 143 | | | + |
|---|
| 144 | | | function quantifierPrefix() : [double, double]? { |
|---|
| 145 | | | - if (eat("*")) return [0,Infinity]; |
|---|
| 146 | | | - else if (eat("+")) return [1,Infinity]; |
|---|
| 147 | | | - else if (eat("?")) return [0,1]; |
|---|
| 148 | | | - else if (eat("{")) { |
|---|
| 149 | | | - let min : double = decimalDigits(); |
|---|
| 150 | | | - let max : double = min; |
|---|
| 151 | | | - if (eat(",")) { |
|---|
| 152 | | | - if (eat("}")) |
|---|
| 153 | | | - max = Infinity; |
|---|
| 154 | | | - else { |
|---|
| 155 | | | - max = decimalDigits(); |
|---|
| 156 | | | + switch (peekCharCode()) { |
|---|
| 157 | | | + case 0x2Au /* "*" */: |
|---|
| 158 | | | + advance(); |
|---|
| 159 | | | + return star; |
|---|
| 160 | | | + |
|---|
| 161 | | | + case 0x2Bu /* "+" */: |
|---|
| 162 | | | + advance(); |
|---|
| 163 | | | + return plus; |
|---|
| 164 | | | + |
|---|
| 165 | | | + case 0x3Fu /* "?" */: |
|---|
| 166 | | | + advance(); |
|---|
| 167 | | | + return ques; |
|---|
| 168 | | | + |
|---|
| 169 | | | + case 0x7Bu /* "{" */: |
|---|
| 170 | | | + advance(); |
|---|
| 171 | | | + { |
|---|
| 172 | | | + let min : double = decimalDigits(); |
|---|
| 173 | | | + let max : double = min; |
|---|
| 174 | | | + if (eat(",")) { |
|---|
| 175 | | | + if (eat("}")) |
|---|
| 176 | | | + max = Infinity; |
|---|
| 177 | | | + else { |
|---|
| 178 | | | + max = decimalDigits(); |
|---|
| 179 | | | + match("}"); |
|---|
| 180 | | | + } |
|---|
| 181 | | | + } else { |
|---|
| 182 | | | match("}"); |
|---|
| 183 | | | } |
|---|
| 184 | | | - } else { |
|---|
| 185 | | | - match("}"); |
|---|
| 186 | | | + if (isFinite(max) && max < min) |
|---|
| 187 | | | + fail( SyntaxError, "max quant must be at least as large as min" ); |
|---|
| 188 | | | + return [min,max]; |
|---|
| 189 | | | } |
|---|
| 190 | | | - if (isFinite(max) && max < min) |
|---|
| 191 | | | - fail( SyntaxError, "max quant must be at least as large as min" ); |
|---|
| 192 | | | - return [min,max]; |
|---|
| 193 | | | - } |
|---|
| 194 | | | - else |
|---|
| 195 | | | + |
|---|
| 196 | | | + default: |
|---|
| 197 | | | return null; |
|---|
| 198 | | | + } |
|---|
| 199 | | | } |
|---|
| 200 | | | |
|---|
| 201 | | | function atom() : Matcher? { |
|---|
| 202 | | | if (atEnd()) |
|---|
| 203 | | | return null; |
|---|
| 204 | | | |
|---|
| 205 | | | - if (lookingAt(")")) |
|---|
| 206 | | | + switch (peekCharCode()) { |
|---|
| 207 | | | + case 0x29u /* ")" */: |
|---|
| 208 | | | return null; |
|---|
| 209 | | | |
|---|
| 210 | | | - if (eat(".")) |
|---|
| 211 | | | + case 0x2Eu /* "." */: |
|---|
| 212 | | | + advance(); |
|---|
| 213 | | | return new CharsetMatcher(charset_notlinebreak); |
|---|
| 214 | | | |
|---|
| 215 | | | - if (eat("(?:")) { |
|---|
| 216 | | | - let d : Matcher = disjunction(); |
|---|
| 217 | | | - match(")"); |
|---|
| 218 | | | - return d; |
|---|
| 219 | | | - } |
|---|
| 220 | | | - |
|---|
| 221 | | | - if (eat("(?=")) { |
|---|
| 222 | | | - let d : Matcher = disjunction(); |
|---|
| 223 | | | - match(")"); |
|---|
| 224 | | | - intrinsic::assert(d !== null); |
|---|
| 225 | | | - return new PositiveLookahead(d); |
|---|
| 226 | | | - } |
|---|
| 227 | | | - |
|---|
| 228 | | | - if (eat("(?!")) { |
|---|
| 229 | | | - let d : Matcher = disjunction(); |
|---|
| 230 | | | - match(")"); |
|---|
| 231 | | | - return new NegativeLookahead(d); |
|---|
| 232 | | | - } |
|---|
| 233 | | | - |
|---|
| 234 | | | - if (eat("(?#")) { |
|---|
| 235 | | | - consumeUntil(")"); |
|---|
| 236 | | | - match(")"); |
|---|
| 237 | | | - return new Empty; |
|---|
| 238 | | | - } |
|---|
| 239 | | | - |
|---|
| 240 | | | - if (eat("(?P<")) { |
|---|
| 241 | | | - let name : string = identifier(); |
|---|
| 242 | | | - match(">"); |
|---|
| 243 | | | + case 0x28u /* "(" */: |
|---|
| 244 | | | + advance(); |
|---|
| 245 | | | + |
|---|
| 246 | | | + if (peekCharCode() == 0x3Fu /* "?" */) { |
|---|
| 247 | | | + advance(); |
|---|
| 248 | | | + switch (peekChar()) { |
|---|
| 249 | | | + case ":": |
|---|
| 250 | | | + advance(); |
|---|
| 251 | | | + let d : Matcher = disjunction(); |
|---|
| 252 | | | + match(")"); |
|---|
| 253 | | | + return d; |
|---|
| 254 | | | + |
|---|
| 255 | | | + case "=": |
|---|
| 256 | | | + advance(); |
|---|
| 257 | | | + let d : Matcher = disjunction(); |
|---|
| 258 | | | + match(")"); |
|---|
| 259 | | | + intrinsic::assert(d !== null); |
|---|
| 260 | | | + return new PositiveLookahead(d); |
|---|
| 261 | | | + |
|---|
| 262 | | | + case "!": |
|---|
| 263 | | | + advance(); |
|---|
| 264 | | | + let d : Matcher = disjunction(); |
|---|
| 265 | | | + match(")"); |
|---|
| 266 | | | + return new NegativeLookahead(d); |
|---|
| 267 | | | + |
|---|
| 268 | | | + case "#": |
|---|
| 269 | | | + advance(); |
|---|
| 270 | | | + consumeUntil(")"); |
|---|
| 271 | | | + match(")"); |
|---|
| 272 | | | + return new Empty; |
|---|
| 273 | | | + |
|---|
| 274 | | | + case "P": |
|---|
| 275 | | | + advance(); |
|---|
| 276 | | | + if (eat("<")) { |
|---|
| 277 | | | + let name : string = identifier(); |
|---|
| 278 | | | + match(">"); |
|---|
| 279 | | | + let capno : uint = parenIndex++; |
|---|
| 280 | | | + parenCount++; |
|---|
| 281 | | | + let d : Matcher = disjunction(); |
|---|
| 282 | | | + parenCount--; |
|---|
| 283 | | | + match(")"); |
|---|
| 284 | | | + for each ( let n : string in names ) { |
|---|
| 285 | | | + if (n === name) |
|---|
| 286 | | | + fail( SyntaxError, "Multiply defined capture name: " + name ); |
|---|
| 287 | | | + } |
|---|
| 288 | | | + names[capno] = name; |
|---|
| 289 | | | + return new Capturing(d, capno); |
|---|
| 290 | | | + } |
|---|
| 291 | | | + |
|---|
| 292 | | | + if (eat("=")) { |
|---|
| 293 | | | + let name : string = identifier(); |
|---|
| 294 | | | + match(")"); |
|---|
| 295 | | | + for ( let [i,n] : [string,string?] in names ) { |
|---|
| 296 | | | + if (n === name) |
|---|
| 297 | | | + return new Backref(uint(i)); |
|---|
| 298 | | | + } |
|---|
| 299 | | | + fail( SyntaxError, "Unknown backref name " + name ); |
|---|
| 300 | | | + } |
|---|
| 301 | | | + |
|---|
| 302 | | | + default: |
|---|
| 303 | | | + fail( SyntaxError, "Bogus (? pattern" ); |
|---|
| 304 | | | + } |
|---|
| 305 | | | + } // peekChar() != "?" |
|---|
| 306 | | | + |
|---|
| 307 | | | let capno : uint = parenIndex++; |
|---|
| 308 | | | parenCount++; |
|---|
| 309 | | | let d : Matcher = disjunction(); |
|---|
| 310 | | | parenCount--; |
|---|
| 311 | | | match(")"); |
|---|
| 312 | | | - for each ( let n : string in names ) { |
|---|
| 313 | | | - if (n === name) |
|---|
| 314 | | | - fail( SyntaxError, "Multiply defined capture name: " + name ); |
|---|
| 315 | | | - } |
|---|
| 316 | | | - names[capno] = name; |
|---|
| 317 | | | return new Capturing(d, capno); |
|---|
| 318 | | | - } |
|---|
| 319 | | | - |
|---|
| 320 | | | - if (eat("(?P=")) { |
|---|
| 321 | | | - let name : string = identifier(); |
|---|
| 322 | | | - match(")"); |
|---|
| 323 | | | - for ( let [i,n] : [string,string?] in names ) { |
|---|
| 324 | | | - if (n === name) |
|---|
| 325 | | | - return new Backref(uint(i)); |
|---|
| 326 | | | - } |
|---|
| 327 | | | - fail( SyntaxError, "Unknown backref name " + name ); |
|---|
| 328 | | | - } |
|---|
| 329 | | | - |
|---|
| 330 | | | - if (eat("(?")) |
|---|
| 331 | | | - fail( SyntaxError, "Bogus (? pattern" ); |
|---|
| 332 | | | - |
|---|
| 333 | | | - if (eat("(")) { |
|---|
| 334 | | | - let capno : uint = parenIndex++; |
|---|
| 335 | | | - parenCount++; |
|---|
| 336 | | | - let d : Matcher = disjunction(); |
|---|
| 337 | | | - parenCount--; |
|---|
| 338 | | | - match(")"); |
|---|
| 339 | | | - return new Capturing(d, capno); |
|---|
| 340 | | | - } |
|---|
| 341 | | | - |
|---|
| 342 | | | - if (lookingAt("[")) |
|---|
| 343 | | | + |
|---|
| 344 | | | + case 0x5Bu /* "[" */: |
|---|
| 345 | | | return characterClass(); |
|---|
| 346 | | | |
|---|
| 347 | | | - if (lookingAt("\\")) |
|---|
| 348 | | | + case 0x5Cu /* "\\" */: |
|---|
| 349 | | | return atomEscape(); |
|---|
| 350 | | | |
|---|
| 351 | | | - if (lookingAt("^") || |
|---|
| 352 | | | - lookingAt("$") || |
|---|
| 353 | | | - lookingAt("*") || |
|---|
| 354 | | | - lookingAt("+") || |
|---|
| 355 | | | - lookingAt("?") || |
|---|
| 356 | | | - lookingAt("{") || |
|---|
| 357 | | | - lookingAt("}") || |
|---|
| 358 | | | - lookingAt("]") || |
|---|
| 359 | | | - lookingAt("|")) |
|---|
| 360 | | | + case 0x5Eu /* "^" */: |
|---|
| 361 | | | + case 0x24u /* "$" */: |
|---|
| 362 | | | + case 0x2Au /* "*" */: |
|---|
| 363 | | | + case 0x2Bu /* "+" */: |
|---|
| 364 | | | + case 0x3Fu /* "?" */: |
|---|
| 365 | | | + case 0x7Bu /* "{" */: |
|---|
| 366 | | | + case 0x7Cu /* "|" */: |
|---|
| 367 | | | + case 0x7Du /* "}" */: |
|---|
| 368 | | | + case 0x5Du /* "]" */: |
|---|
| 369 | | | return null; |
|---|
| 370 | | | - //fail( SyntaxError, "Illegal character in expression: " + peekChar() ); |
|---|
| 371 | | | - |
|---|
| 372 | | | - return new CharsetMatcher(new CharsetAdhoc(consumeChar())); |
|---|
| 373 | | | + |
|---|
| 374 | | | + default: |
|---|
| 375 | | | + return new CharsetMatcher(new CharsetAdhoc(consumeChar())); |
|---|
| 376 | | | + } |
|---|
| 377 | | | } |
|---|
| 378 | | | |
|---|
| 379 | | | function atomEscape() : Matcher { |
|---|
| 380 | | | @@ -246,8 +294,10 @@ package RegExpInternals |
|---|
| 381 | | | function characterClass() : Matcher { |
|---|
| 382 | | | match("["); |
|---|
| 383 | | | let inverted : boolean = false; |
|---|
| 384 | | | - if (eat("^")) |
|---|
| 385 | | | + if (peekChar() == "^") { |
|---|
| 386 | | | + advance(); |
|---|
| 387 | | | inverted = true; |
|---|
| 388 | | | + } |
|---|
| 389 | | | let ranges : Charset = classRanges(); |
|---|
| 390 | | | match("]"); |
|---|
| 391 | | | return new CharsetMatcher(inverted ? new CharsetComplement(ranges) : ranges); |
|---|
| 392 | | | @@ -362,10 +412,6 @@ package RegExpInternals |
|---|
| 393 | | | ce : function (Charset) : (Matcher,Charset), |
|---|
| 394 | | | ch : function (string) : (Matcher,Charset), |
|---|
| 395 | | | allow_b : boolean ) : (Matcher,Charset) { |
|---|
| 396 | | | - let (t : double? = decimalEscape()) { |
|---|
| 397 | | | - if (t !== null) |
|---|
| 398 | | | - return de(t); |
|---|
| 399 | | | - } |
|---|
| 400 | | | |
|---|
| 401 | | | let (t : Charset? = characterClassEscape()) { |
|---|
| 402 | | | if (t !== null) |
|---|
| 403 | | | @@ -377,6 +423,11 @@ package RegExpInternals |
|---|
| 404 | | | return ch(t); |
|---|
| 405 | | | } |
|---|
| 406 | | | |
|---|
| 407 | | | + let (t : double? = decimalEscape()) { |
|---|
| 408 | | | + if (t !== null) |
|---|
| 409 | | | + return de(t); |
|---|
| 410 | | | + } |
|---|
| 411 | | | + |
|---|
| 412 | | | eat("\\"); |
|---|
| 413 | | | fail( SyntaxError, "Failed to match escape sequence " + peekChar() ); |
|---|
| 414 | | | } |
|---|
| 415 | | | @@ -385,14 +436,16 @@ package RegExpInternals |
|---|
| 416 | | | throws an error if it consumes and then fails. |
|---|
| 417 | | | */ |
|---|
| 418 | | | function decimalEscape() : double? { |
|---|
| 419 | | | - if (lookingAt("\\0") || lookingAt("\\1") || lookingAt("\\2") || lookingAt("\\3") || |
|---|
| 420 | | | - lookingAt("\\4") || lookingAt("\\5") || lookingAt("\\6") || lookingAt("\\7") || |
|---|
| 421 | | | - lookingAt("\\8") || lookingAt("\\9")) { |
|---|
| 422 | | | - consumeChar("\\"); |
|---|
| 423 | | | + if (peekChar() != "\\") |
|---|
| 424 | | | + return null; |
|---|
| 425 | | | + let saved : uint = idx; |
|---|
| 426 | | | + advance(); |
|---|
| 427 | | | + let c : uint = peekCharCode(); |
|---|
| 428 | | | + if (c >= 0x30u && c <= 0x39u) { |
|---|
| 429 | | | return decimalDigits(); |
|---|
| 430 | | | } |
|---|
| 431 | | | - else |
|---|
| 432 | | | - return null; |
|---|
| 433 | | | + idx = saved; |
|---|
| 434 | | | + return null; |
|---|
| 435 | | | } |
|---|
| 436 | | | |
|---|
| 437 | | | /* Returns null if it does not consume anything but fails; |
|---|
| 438 | | | @@ -409,15 +462,37 @@ package RegExpInternals |
|---|
| 439 | | | return cls; |
|---|
| 440 | | | } |
|---|
| 441 | | | |
|---|
| 442 | | | - if (eat("\\d")) return charset_digit; |
|---|
| 443 | | | - if (eat("\\D")) return charset_notdigit; |
|---|
| 444 | | | - if (eat("\\s")) return charset_space; |
|---|
| 445 | | | - if (eat("\\S")) return charset_notspace; |
|---|
| 446 | | | - if (eat("\\w")) return charset_word; |
|---|
| 447 | | | - if (eat("\\W")) return charset_notword; |
|---|
| 448 | | | - if (eat("\\p{")) return unicodeSet(false); |
|---|
| 449 | | | - if (eat("\\P{")) return unicodeSet(true); |
|---|
| 450 | | | + if (peekCharCode() != 0x5Cu /* "\\" */) |
|---|
| 451 | | | + return null; |
|---|
| 452 | | | |
|---|
| 453 | | | + let saved : uint = idx; |
|---|
| 454 | | | + advance(); |
|---|
| 455 | | | + |
|---|
| 456 | | | + let invert : boolean = true; |
|---|
| 457 | | | + |
|---|
| 458 | | | + switch (peekCharCode()) { |
|---|
| 459 | | | + |
|---|
| 460 | | | + case 0x64u /* "d" */: advance(); return charset_digit; |
|---|
| 461 | | | + case 0x44u /* "D" */: advance(); return charset_nondigit; |
|---|
| 462 | | | + case 0x73u /* "s" */: advance(); return charset_space; |
|---|
| 463 | | | + case 0x53u /* "S" */: advance(); return charset_notspace; |
|---|
| 464 | | | + case 0x77u /* "w" */: advance(); return charset_word; |
|---|
| 465 | | | + case 0x57u /* "W" */: advance(); return charset_notword; |
|---|
| 466 | | | + |
|---|
| 467 | | | + case 0x70u /* "p" */: |
|---|
| 468 | | | + invert = false; |
|---|
| 469 | | | + case 0x50u /* "P" */: |
|---|
| 470 | | | + { |
|---|
| 471 | | | + let saved : uint = idx; |
|---|
| 472 | | | + advance(); |
|---|
| 473 | | | + if (peekChar() == "{") { |
|---|
| 474 | | | + advance(); |
|---|
| 475 | | | + return unicodeSet(invert); |
|---|
| 476 | | | + } |
|---|
| 477 | | | + idx = saved; |
|---|
| 478 | | | + } |
|---|
| 479 | | | + } |
|---|
| 480 | | | + idx = saved; |
|---|
| 481 | | | return null; |
|---|
| 482 | | | } |
|---|
| 483 | | | |
|---|
| 484 | | | @@ -451,17 +526,29 @@ package RegExpInternals |
|---|
| 485 | | | return string.fromCharCode(k); |
|---|
| 486 | | | } |
|---|
| 487 | | | |
|---|
| 488 | | | - if (allow_b && eat("\\b")) |
|---|
| 489 | | | - return "\\b"; |
|---|
| 490 | | | - if (eat("\\f")) |
|---|
| 491 | | | - return "\f"; |
|---|
| 492 | | | - if (eat("\\n")) |
|---|
| 493 | | | - return "\n"; |
|---|
| 494 | | | - if (eat("\\r")) |
|---|
| 495 | | | - return "\r"; |
|---|
| 496 | | | - if (eat("\\t")) |
|---|
| 497 | | | - return "\t"; |
|---|
| 498 | | | - if (eat("\\c")) |
|---|
| 499 | | | + let c : uint = peekCharCode(); |
|---|
| 500 | | | + |
|---|
| 501 | | | + if (c != 0x5Cu /* "\\" */) |
|---|
| 502 | | | + return null; |
|---|
| 503 | | | + |
|---|
| 504 | | | + advance(); |
|---|
| 505 | | | + c = peekCharCode(); |
|---|
| 506 | | | + |
|---|
| 507 | | | + switch (c) { |
|---|
| 508 | | | + case 0x62u /* "b" */: |
|---|
| 509 | | | + if (allow_b) { |
|---|
| 510 | | | + advance(); |
|---|
| 511 | | | + return "\\b"; |
|---|
| 512 | | | + } |
|---|
| 513 | | | + break; |
|---|
| 514 | | | + |
|---|
| 515 | | | + case 0x66u /* "f" */: advance(); return "\f"; |
|---|
| 516 | | | + case 0x6Eu /* "n" */: advance(); return "\n"; |
|---|
| 517 | | | + case 0x72u /* "r" */: advance(); return "\r"; |
|---|
| 518 | | | + case 0x74u /* "t" */: advance(); return "\t"; |
|---|
| 519 | | | + |
|---|
| 520 | | | + case 0x63u /* "c" */: |
|---|
| 521 | | | + advance(); |
|---|
| 522 | | | let (c : string = consumeChar()) { |
|---|
| 523 | | | if (c >= "A" && c <= "Z") |
|---|
| 524 | | | return string.fromCharCode(c.charCodeAt(0) - "A".charCodeAt(0)); |
|---|
| 525 | | | @@ -469,20 +556,27 @@ package RegExpInternals |
|---|
| 526 | | | return string.fromCharCode(c.charCodeAt(0) - "a".charCodeAt(0)); |
|---|
| 527 | | | fail( SyntaxError, "Bogus \\c sequence: " + c ); |
|---|
| 528 | | | } |
|---|
| 529 | | | - if (eat("\\x{") || eat("\\X{") || eat("\\u{") || eat("\\U{")) { |
|---|
| 530 | | | - let s : string = hexDigits(); |
|---|
| 531 | | | - match("}"); |
|---|
| 532 | | | - return s; |
|---|
| 533 | | | + |
|---|
| 534 | | | + case 0x77u /* "x" */: |
|---|
| 535 | | | + case 0x57u /* "X" */: |
|---|
| 536 | | | + case 0x75u /* "u" */: |
|---|
| 537 | | | + case 0x55u /* "U" */: |
|---|
| 538 | | | + advance(); |
|---|
| 539 | | | + if (peekCharCode() == 0x7Bu /* "{" */) { |
|---|
| 540 | | | + advance(); |
|---|
| 541 | | | + let s : string = hexDigits(); |
|---|
| 542 | | | + match("}"); |
|---|
| 543 | | | + return s; |
|---|
| 544 | | | + } else if (c == 0x77u /* "x" */ || c == 0x57u /* "X" */) { |
|---|
| 545 | | | + return hexDigits(2); |
|---|
| 546 | | | + } else { |
|---|
| 547 | | | + return hexDigits(4); |
|---|
| 548 | | | + } |
|---|
| 549 | | | } |
|---|
| 550 | | | - if (eat("\\x") || eat("\\X")) |
|---|
| 551 | | | - return hexDigits(2); |
|---|
| 552 | | | - if (eat("\\u") || eat("\\U")) |
|---|
| 553 | | | - return hexDigits(4); |
|---|
| 554 | | | - if (isIdentifierPart(peekChar())) |
|---|
| 555 | | | - return null; |
|---|
| 556 | | | - consumeChar("\\"); |
|---|
| 557 | | | + |
|---|
| 558 | | | if (atEnd()) |
|---|
| 559 | | | fail( SyntaxError, "EOF inside escape sequence" ); |
|---|
| 560 | | | + |
|---|
| 561 | | | return consumeChar(); |
|---|
| 562 | | | } |
|---|
| 563 | | | |
|---|
| 564 | | | @@ -508,20 +602,24 @@ package RegExpInternals |
|---|
| 565 | | | function lookingAt(c : string) : void { |
|---|
| 566 | | | if (atEnd()) |
|---|
| 567 | | | return false; |
|---|
| 568 | | | - for ( let i : uint=0 ; i < c.length && i+idx < source.length ; i++ ) |
|---|
| 569 | | | - if (c[i] !== source[i+idx]) |
|---|
| 570 | | | + let i : uint = 0; |
|---|
| 571 | | | + let j : uint = idx; |
|---|
| 572 | | | + let ilim = i + c.length; |
|---|
| 573 | | | + let jlim = j + slen; |
|---|
| 574 | | | + for ( ; i < ilim && j < jlim ; i++, j++ ) |
|---|
| 575 | | | + if (c[i] !== source[j]) |
|---|
| 576 | | | return false; |
|---|
| 577 | | | return true; |
|---|
| 578 | | | } |
|---|
| 579 | | | |
|---|
| 580 | | | function identifier() : string { |
|---|
| 581 | | | let name : string? = null; |
|---|
| 582 | | | - if (idx < source.length) { |
|---|
| 583 | | | + if (idx < slen) { |
|---|
| 584 | | | let c : string = source[idx++]; |
|---|
| 585 | | | if (!isIdentifierStart(c)) |
|---|
| 586 | | | fail( SyntaxError, "Expected identifier" ); |
|---|
| 587 | | | let name = c; |
|---|
| 588 | | | - while (idx < source.length && isIdentifierPart(source[idx])) |
|---|
| 589 | | | + while (idx < slen && isIdentifierPart(source[idx])) |
|---|
| 590 | | | name += source[idx++]; |
|---|
| 591 | | | skip(); |
|---|
| 592 | | | return name; |
|---|
| 593 | | | @@ -539,15 +637,25 @@ package RegExpInternals |
|---|
| 594 | | | } |
|---|
| 595 | | | |
|---|
| 596 | | | function atEnd() |
|---|
| 597 | | | - idx >= source.length; |
|---|
| 598 | | | + idx >= slen; |
|---|
| 599 | | | |
|---|
| 600 | | | function peekChar() { |
|---|
| 601 | | | - if (!atEnd()) |
|---|
| 602 | | | - return source[idx]; |
|---|
| 603 | | | + if (idx < slen) |
|---|
| 604 | | | + return source[idx] |
|---|
| 605 | | | else |
|---|
| 606 | | | return "*END*"; |
|---|
| 607 | | | } |
|---|
| 608 | | | |
|---|
| 609 | | | + function peekCharCode() { |
|---|
| 610 | | | + // In a production implementation, this would probably be |
|---|
| 611 | | | + // no faster than peekChar. In our reference |
|---|
| 612 | | | + // implementation, it is substantially faster. |
|---|
| 613 | | | + if (idx < slen) |
|---|
| 614 | | | + return magic::charCodeAt(source, uint(idx)); |
|---|
| 615 | | | + else |
|---|
| 616 | | | + return 0x0u; |
|---|
| 617 | | | + } |
|---|
| 618 | | | + |
|---|
| 619 | | | function consumeChar(c : string? = null) : string { |
|---|
| 620 | | | if (!atEnd() && (c === null || source[idx] == c)) |
|---|
| 621 | | | return source[idx++]; |
|---|
| 622 | | | @@ -557,6 +665,14 @@ package RegExpInternals |
|---|
| 623 | | | fail( SyntaxError, "Unexected EOF" ); |
|---|
| 624 | | | } |
|---|
| 625 | | | |
|---|
| 626 | | | + function advance() { |
|---|
| 627 | | | + if (idx + 1 > slen) |
|---|
| 628 | | | + fail( SyntaxError, "advancing beyond end of regexp"); |
|---|
| 629 | | | + idx++; |
|---|
| 630 | | | + if (extended) |
|---|
| 631 | | | + skip(); |
|---|
| 632 | | | + } |
|---|
| 633 | | | + |
|---|
| 634 | | | function consumeUntil(c : string) : void { |
|---|
| 635 | | | while (!atEnd() && source[idx] != c) |
|---|
| 636 | | | ++idx; |
|---|
| 637 | | | @@ -567,11 +683,13 @@ package RegExpInternals |
|---|
| 638 | | | return; |
|---|
| 639 | | | |
|---|
| 640 | | | while (!atEnd()) { |
|---|
| 641 | | | - if (source[idx] == '#') { |
|---|
| 642 | | | - while (!atEnd() && !isTerminator(source[idx])) |
|---|
| 643 | | | + let c : uint = peekCharCode(); |
|---|
| 644 | | | + if (c == 0x22u /* '#' */) { |
|---|
| 645 | | | + ++idx; |
|---|
| 646 | | | + while (!atEnd() && !isTerminator(peekCharCode())) |
|---|
| 647 | | | ++idx; |
|---|
| 648 | | | } |
|---|
| 649 | | | - else if (isBlank(source[idx]) || isTerminator(source[idx]) || isFormatControl(source[idx])) |
|---|
| 650 | | | + else if (isBlankCode(c) || isTerminatorCode(c) || isFormatControlCode(c)) |
|---|
| 651 | | | ++idx; |
|---|
| 652 | | | else |
|---|
| 653 | | | return; |
|---|