encoding unicode.ts

82% Statements 41/50
77.78% Branches 35/45
81.82% Functions 9/11
81.63% Lines 40/49
Press n or j to go to the next uncovered block, b, p or k for the previous block.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141  
6x
 
 
 
 
 
6x
 
1x
 
 
 
34x
 
34x
 
 
 
 
1x
 
 
 
33x
 
 
 
6x
 
1x
 
 
34x
 
34x
 
 
 
 
 
1x
 
 
 
1x
 
 
 
33x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68x
68x
68x
68x
68x
68x
 
68x
 
 
 
68x
 
 
 
68x
30744x
 
30744x
29384x
 
1360x
 
 
 
 
 
68x
 
68x
 
68x
2x
66x
 
66x
 
 
 
 
 
68x
 
 
 
6x
 
1x
 
 
15406x
 
 
 
 
 
 
 
 
6x
 
1x
 
 
 
15406x
 
 
 
 
 
 
 
  import { Context, Recogniser } from '.';
const match = require('../match').default;
 
/**
 * The recognisers in this file match UTF-16 and UTF-32, both big- and
 * little-endian. The BOM will be used if it is present; UTF-16 is currently
 * recognised only through its BOM.
 */
/**
 * Recogniser for big-endian UTF-16, identified solely by a leading FE FF
 * byte-order mark.
 */
export class UTF_16BE implements Recogniser {
  /** Charset label reported for this recogniser. */
  name() {
    return 'UTF-16BE';
  }

  /**
   * Checks whether the raw input begins with the UTF-16BE byte-order mark.
   *
   * @param det - detection context; only `fRawInput` is read here.
   * @returns the result of `match(det, this, 100)` when the BOM is present,
   *   otherwise `null`.
   */
  match(det: Context) {
    const bytes = det.fRawInput;

    // A BOM needs two bytes; anything shorter cannot qualify.
    if (bytes.length < 2) {
      return null;
    }

    const startsWithBom =
      (bytes[0] & 0xff) === 0xfe && (bytes[1] & 0xff) === 0xff;

    // TODO: Do some statistics to check for unsigned UTF-16BE
    return startsWithBom ? match(det, this, 100) : null; // confidence = 100
  }
}
 
/**
 * Recogniser for little-endian UTF-16, identified solely by a leading FF FE
 * byte-order mark that is not immediately followed by two zero bytes.
 */
export class UTF_16LE implements Recogniser {
  /** Charset label reported for this recogniser. */
  name() {
    return 'UTF-16LE';
  }

  /**
   * Checks whether the raw input begins with the UTF-16LE byte-order mark.
   *
   * @param det - detection context; only `fRawInput` is read here.
   * @returns the result of `match(det, this, 100)` when the BOM is present and
   *   the input does not look like UTF-32LE, otherwise `null`.
   */
  match(det: Context) {
    const input = det.fRawInput;

    if (
      input.length >= 2 &&
      (input[0] & 0xff) === 0xff &&
      (input[1] & 0xff) === 0xfe
    ) {
      // LE BOM is present.
      if (input.length >= 4 && input[2] === 0x00 && input[3] === 0x00) {
        // FF FE 00 00 is the UTF-32LE BOM, so this is probably UTF-32 LE,
        // not UTF-16.
        return null;
      }
      return match(det, this, 100); // confidence = 100
    }

    // TODO: Do some statistics to check for unsigned UTF-16LE
    return null;
  }
}
 
/**
 * Contract for objects that can read one numeric unit out of a byte buffer.
 * In this file it is implemented by the UTF-32 recognisers.
 */
interface WithGetChar {
  /**
   * Reads the unit that starts at `index` in `input`.
   *
   * The UTF-32BE/LE implementations in this file combine the four bytes at
   * `index`..`index + 3` into a single 32-bit integer; the base `UTF_32`
   * implementation always returns -1.
   */
  getChar(input: Uint8Array, index: number): number;
}
 
/**
 * Shared detection logic for the UTF-32 recognisers.
 *
 * Subclasses select the byte order by overriding `getChar`. The base
 * `getChar` returns -1 for every unit, so every unit counts as invalid and
 * the base class on its own never yields a match.
 */
class UTF_32 implements Recogniser, WithGetChar {
  /** Charset label reported for this recogniser. */
  name() {
    return 'UTF-32';
  }

  /**
   * Decodes the 32-bit unit starting at `index`. Overridden per byte order;
   * this placeholder reports every unit as invalid.
   */
  getChar(input: Uint8Array, index: number): number {
    return -1;
  }

  /**
   * Scores how plausibly the raw input is UTF-32 in this recogniser's byte
   * order, based on a leading BOM and the ratio of valid to invalid units.
   *
   * @param det - detection context; `fRawInput` and `fRawLength` are read.
   * @returns the result of `match(det, this, confidence)` for a non-zero
   *   confidence (25, 80 or 100), or `null` when there is no complete 4-byte
   *   unit or the confidence is zero.
   */
  match(det: Context) {
    const input = det.fRawInput;
    // Round down to a whole number of 4-byte units. JS division is
    // floating-point, so an explicit floor is required; without it, trailing
    // partial bytes (and inputs shorter than 4 bytes) would be decoded as an
    // extra zero-padded unit.
    const limit = Math.floor(det.fRawLength / 4) * 4;

    if (limit === 0) {
      return null;
    }

    const hasBOM = this.getChar(input, 0) === 0x0000feff;

    let numValid = 0;
    let numInvalid = 0;

    for (let i = 0; i < limit; i += 4) {
      const ch = this.getChar(input, i);

      // Negative values come from a set top bit (or the base placeholder);
      // D800..DFFF are surrogates, which are not valid in UTF-32.
      if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) {
        numInvalid += 1;
      } else {
        numValid += 1;
      }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    //    and the existence of valid and/or invalid multi-byte sequences.
    let confidence = 0;
    if (hasBOM && numInvalid === 0) {
      confidence = 100;
    } else if (hasBOM && numValid > numInvalid * 10) {
      confidence = 80;
    } else if (numValid > 3 && numInvalid === 0) {
      confidence = 100;
    } else if (numValid > 0 && numInvalid === 0) {
      confidence = 80;
    } else if (numValid > numInvalid * 10) {
      // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance.
      confidence = 25;
    }

    return confidence === 0 ? null : match(det, this, confidence);
  }
}
 
/**
 * UTF-32 recogniser that reads each 4-byte unit most-significant byte first.
 */
export class UTF_32BE extends UTF_32 {
  /** Charset label reported for this recogniser. */
  name() {
    return 'UTF-32BE';
  }

  /**
   * Combines the four bytes at `index`..`index + 3` into one signed 32-bit
   * integer, treating the byte at `index` as the most significant.
   */
  getChar(input: Uint8Array, index: number) {
    let value = 0;
    // Walk from the most significant byte to the least, shifting as we go.
    for (let offset = 0; offset < 4; offset++) {
      value = (value << 8) | (input[index + offset] & 0xff);
    }
    return value;
  }
}
 
/**
 * UTF-32 recogniser that reads each 4-byte unit least-significant byte first.
 */
export class UTF_32LE extends UTF_32 {
  /** Charset label reported for this recogniser. */
  name() {
    return 'UTF-32LE';
  }

  /**
   * Combines the four bytes at `index`..`index + 3` into one signed 32-bit
   * integer, treating the byte at `index + 3` as the most significant.
   */
  getChar(input: Uint8Array, index: number) {
    let value = 0;
    // Walk backwards so the last byte of the unit ends up in the top bits.
    for (let offset = 3; offset >= 0; offset--) {
      value = (value << 8) | (input[index + offset] & 0xff);
    }
    return value;
  }
}