Press n or j to go to the next uncovered block, b, p or k for the previous block.
import { Context, Recogniser } from '.';
var match = require('../match').default;

/**
 * Recogniser that scores how likely a byte buffer is to be UTF-8 encoded.
 *
 * The score is derived from three observations on the raw input: whether it
 * starts with the UTF-8 byte order mark (EF BB BF), how many well-formed
 * multi-byte sequences it contains, and how many malformed ones.
 */
export default class Utf8 implements Recogniser {
  /** Name of the charset this recogniser reports. */
  name() {
    return 'UTF-8';
  }

  /**
   * Scan `det.fRawInput` (the first `det.fRawLength` bytes) and produce a match.
   *
   * @param det Detection context; only `fRawInput` and `fRawLength` are read here.
   * @returns The result of `match(det, this, confidence)`, or `null` when the
   *          invalid sequences outweigh the valid ones.
   */
  match(det: Context) {
    const input = det.fRawInput;
    const length = det.fRawLength;

    let numValid = 0;
    let numInvalid = 0;
    let trailBytes = 0;

    const hasBOM =
      length >= 3 &&
      (input[0] & 0xff) === 0xef &&
      (input[1] & 0xff) === 0xbb &&
      (input[2] & 0xff) === 0xbf;

    // Scan for multi-byte sequences
    for (let i = 0; i < length; i++) {
      const b = input[i];
      if ((b & 0x80) === 0) continue; // ASCII

      // Hi bit on char found. Figure out how long the sequence should be
      if ((b & 0xe0) === 0xc0) {
        trailBytes = 1;
      } else if ((b & 0xf0) === 0xe0) {
        trailBytes = 2;
      } else if ((b & 0xf8) === 0xf0) {
        trailBytes = 3;
      } else {
        // Not a legal lead byte (stray continuation byte or 0xF8..0xFF).
        numInvalid++;
        // Give up early once the input is clearly not UTF-8.
        if (numInvalid > 5) break;
        // Move straight on to the next byte. Falling into the trail-byte loop
        // with nothing to verify would count this byte as invalid a second
        // time and consume the bytes that follow it.
        continue;
      }

      // Verify that we've got the right number of trail bytes in the sequence
      for (;;) {
        i++;
        if (i >= length) break; // input ended mid-sequence; counted as neither
        if ((input[i] & 0xc0) !== 0x80) {
          numInvalid++;
          break;
        }
        if (--trailBytes === 0) {
          numValid++;
          break;
        }
      }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    // and the existence of valid and/or invalid multi-byte sequences.
    let confidence = 0;
    if (hasBOM && numInvalid === 0) {
      confidence = 100;
    } else if (hasBOM && numValid > numInvalid * 10) {
      confidence = 80;
    } else if (numValid > 3 && numInvalid === 0) {
      confidence = 100;
    } else if (numValid > 0 && numInvalid === 0) {
      confidence = 80;
    } else if (numValid === 0 && numInvalid === 0) {
      // Plain ASCII.
      confidence = 10;
    } else if (numValid > numInvalid * 10) {
      // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
      confidence = 25;
    } else {
      return null;
    }

    return match(det, this, confidence);
  }
}