encoding utf8.ts

90% Statements 45/50
79.55% Branches 35/44
100% Functions 2/2
95.24% Lines 40/42
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79  
 
6x
 
6x
 
11x
 
 
 
34x
34x
34x
34x
34x
 
 
34x
 
 
 
 
 
 
 
 
 
34x
19610x
19610x
 
 
3272x
2112x
1160x
1067x
93x
27x
 
66x
66x
43x
 
 
 
3249x
3860x
3860x
 
3858x
1538x
1538x
 
2320x
1709x
1709x
 
 
 
 
 
 
34x
34x
34x
34x
27x
27x
 
4x
23x
 
 
23x
 
11x
 
 
  import { Context, Recogniser } from '.';
 
var match = require('../match').default;
 
/**
 * Recogniser that scores how likely a byte buffer is to be UTF-8.
 *
 * The score is derived from three observations: whether the input starts
 * with the UTF-8 byte order mark (EF BB BF), how many well-formed multi-byte
 * sequences it contains, and how many malformed bytes/sequences it contains.
 */
export default class Utf8 implements Recogniser {
  /** Returns the charset name reported by this recogniser. */
  name() {
    return 'UTF-8';
  }

  /**
   * Scans the raw input and produces a confidence score for UTF-8.
   *
   * @param det Detection context; only `fRawInput` (indexed as byte values)
   *   and `fRawLength` (number of bytes to scan) are read here.
   * @returns `null` when the input contains malformed sequences and too few
   *   valid ones to be plausible UTF-8; otherwise whatever the imported
   *   `match` helper returns for the computed confidence (10, 25, 80 or 100).
   */
  match(det: Context) {
    const input = det.fRawInput;
    const length = det.fRawLength;
    let numValid = 0;
    let numInvalid = 0;

    const hasBOM =
      length >= 3 &&
      (input[0] & 0xff) === 0xef &&
      (input[1] & 0xff) === 0xbb &&
      (input[2] & 0xff) === 0xbf;

    // Scan for multi-byte sequences
    for (let i = 0; i < length; i++) {
      const b = input[i];
      if ((b & 0x80) === 0) continue; // ASCII

      // Hi bit on char found.  Figure out how long the sequence should be
      let trailBytes: number;
      if ((b & 0xe0) === 0xc0) {
        trailBytes = 1;
      } else if ((b & 0xf0) === 0xe0) {
        trailBytes = 2;
      } else if ((b & 0xf8) === 0xf0) {
        trailBytes = 3;
      } else {
        // Not a legal lead byte (stray continuation byte or 0xF8..0xFF).
        numInvalid++;
        // Give up early: this much garbage will never score as UTF-8.
        if (numInvalid > 5) break;
        // Skip only this byte. Falling through to the trail-byte check would
        // consume the following byte as well and count a second error for it.
        continue;
      }

      // Verify that we've got the right number of trail bytes in the sequence
      for (;;) {
        i++;
        // A sequence truncated by the end of input counts as neither valid
        // nor invalid.
        if (i >= length) break;

        if ((input[i] & 0xc0) !== 0x80) {
          numInvalid++;
          break;
        }
        if (--trailBytes === 0) {
          numValid++;
          break;
        }
      }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    //    and the existence of valid and/or invalid multi-byte sequences.
    let confidence: number;
    if (hasBOM && numInvalid === 0) confidence = 100;
    else if (hasBOM && numValid > numInvalid * 10) confidence = 80;
    else if (numValid > 3 && numInvalid === 0) confidence = 100;
    else if (numValid > 0 && numInvalid === 0) confidence = 80;
    else if (numValid === 0 && numInvalid === 0)
      // Plain ASCII.
      confidence = 10;
    else if (numValid > numInvalid * 10)
      // Probably corrupt utf-8 data.  Valid sequences aren't likely by chance.
      confidence = 25;
    else return null;

    return match(det, this, confidence);
  }
}