All files / src/encoding utf8.ts

90% Statements 45/50
79.55% Branches 35/44
100% Functions 2/2
95.24% Lines 40/42

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79    6x   6x   11x       34x 34x 34x 34x 34x     34x                   34x 19610x 19610x     3272x 2112x 1160x 1067x 93x 27x   66x 66x 43x       3249x 3860x 3860x   3858x 1538x 1538x   2320x 1709x 1709x             34x 34x 34x 34x 27x 27x   4x 23x     23x   11x      
import { Context, Recogniser } from '.';
 
var match = require('../match').default;
 
/**
 * Recogniser that estimates whether a byte buffer is UTF-8 encoded text.
 *
 * The heuristic looks for a UTF-8 byte-order mark and counts well-formed and
 * malformed multi-byte sequences, then maps those counts to a confidence.
 */
export default class Utf8 implements Recogniser {
  /** Returns the name of the charset this recogniser reports. */
  name() {
    return 'UTF-8';
  }

  /**
   * Scans the raw input and scores how likely it is to be UTF-8.
   *
   * Only the lead-byte/trail-byte structure is checked; overlong forms and
   * out-of-range code points are not rejected.
   *
   * @param det Detection context; `fRawInput` is read as an indexable byte
   *   sequence and `fRawLength` as the number of bytes to examine.
   * @returns The result of the module-level `match` helper called with the
   *   computed confidence (10, 25, 80 or 100), or `null` when invalid
   *   sequences are too frequent for the input to plausibly be UTF-8.
   */
  match(det: Context) {
    const input = det.fRawInput;
    const length = det.fRawLength;

    // UTF-8 byte-order mark: EF BB BF.
    const hasBOM =
      length >= 3 &&
      (input[0] & 0xff) === 0xef &&
      (input[1] & 0xff) === 0xbb &&
      (input[2] & 0xff) === 0xbf;

    let numValid = 0;
    let numInvalid = 0;

    // Scan for multi-byte sequences
    for (let i = 0; i < length; i++) {
      const b = input[i];
      if ((b & 0x80) === 0) continue; // ASCII

      // Hi bit on char found.  Figure out how long the sequence should be
      let trailBytes: number;
      if ((b & 0xe0) === 0xc0) {
        trailBytes = 1; // 110xxxxx
      } else if ((b & 0xf0) === 0xe0) {
        trailBytes = 2; // 1110xxxx
      } else if ((b & 0xf8) === 0xf0) {
        trailBytes = 3; // 11110xxx
      } else {
        // Not a legal lead byte (stray trail byte or 0xF8..0xFF).
        numInvalid++;
        // Clearly not UTF-8 once this many bad lead bytes were seen; stop early.
        if (numInvalid > 5) break;
        // Resume scanning at the very next byte. Falling through into the
        // trail-byte check here would swallow that byte and count the same
        // error more than once.
        continue;
      }

      // Verify that we've got the right number of trail bytes in the sequence
      for (;;) {
        i++;
        // Input ended mid-sequence: counted as neither valid nor invalid.
        if (i >= length) break;

        if ((input[i] & 0xc0) !== 0x80) {
          numInvalid++;
          break;
        }
        if (--trailBytes === 0) {
          numValid++;
          break;
        }
      }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    //    and the existence of valid and/or invalid multi-byte sequences.
    let confidence: number;
    if (hasBOM && numInvalid === 0) {
      confidence = 100;
    } else if (hasBOM && numValid > numInvalid * 10) {
      confidence = 80;
    } else if (numValid > 3 && numInvalid === 0) {
      confidence = 100;
    } else if (numValid > 0 && numInvalid === 0) {
      confidence = 80;
    } else if (numValid === 0 && numInvalid === 0) {
      // Plain ASCII.
      confidence = 10;
    } else if (numValid > numInvalid * 10) {
      // Probably corrupt utf-8 data.  Valid sequences aren't likely by chance.
      confidence = 25;
    } else {
      return null;
    }

    return match(det, this, confidence);
  }
}