All files / src/encoding unicode.ts

82% Statements 41/50
77.78% Branches 35/45
81.82% Functions 9/11
81.63% Lines 40/49

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141  6x           6x   1x       34x   34x         1x       33x       6x   1x     34x   34x           1x       1x       33x                                   68x 68x 68x 68x 68x 68x   68x       68x       68x 30744x   30744x 29384x   1360x           68x   68x   68x 2x 66x   66x           68x       6x   1x     15406x                 6x   1x       15406x                
import { Context, Recogniser } from '.';
const match = require('../match').default;
 
/**
 * Recogniser for big-endian UTF-16.
 *
 * Detection is purely signature based: the input is reported as UTF-16BE
 * only when it begins with the byte order mark FE FF.
 */
export class UTF_16BE implements Recogniser {
  name() {
    return 'UTF-16BE';
  }

  /**
   * @param det detection context; only `fRawInput` is inspected.
   * @returns the result of `match(det, this, 100)` when the first two bytes
   *   are FE FF, otherwise `null`.
   */
  match(det: Context) {
    const bytes = det.fRawInput;
    const startsWithBom =
      bytes.length >= 2 &&
      (bytes[0] & 0xff) === 0xfe &&
      (bytes[1] & 0xff) === 0xff;

    // TODO: Do some statistics to check for unsigned (BOM-less) UTF-16BE
    return startsWithBom ? match(det, this, 100) : null;
  }
}
 
/**
 * Recogniser for little-endian UTF-16.
 *
 * Detection is purely signature based: the input must begin with the byte
 * order mark FF FE, and must not continue with 00 00 (FF FE 00 00 is treated
 * as a UTF-32LE signature instead).
 */
export class UTF_16LE implements Recogniser {
  name() {
    return 'UTF-16LE';
  }

  /**
   * @param det detection context; only `fRawInput` is inspected.
   * @returns the result of `match(det, this, 100)` when a UTF-16LE byte order
   *   mark is found, otherwise `null`.
   */
  match(det: Context) {
    const input = det.fRawInput;

    if (
      input.length >= 2 &&
      (input[0] & 0xff) === 0xff &&
      (input[1] & 0xff) === 0xfe
    ) {
      // LE BOM is present.
      if (input.length >= 4 && input[2] === 0x00 && input[3] === 0x00) {
        // It is probably UTF-32 LE, not UTF-16
        return null;
      }
      return match(det, this, 100); // confidence = 100
    }

    // TODO: Do some statistics to check for unsigned (BOM-less) UTF-16LE
    return null;
  }
}
 
/**
 * Contract for objects that can produce a numeric character value from a
 * byte buffer at a given offset.
 */
interface WithGetChar {
  /**
   * @param input byte buffer to read from.
   * @param index offset into `input` at which reading starts.
   * @returns the character value as a number.
   */
  getChar(input: Uint8Array, index: number): number;
}
 
/**
 * Shared detection logic for the UTF-32 recognisers.
 *
 * The input is walked in 4-byte units; each unit is decoded through
 * `getChar` and classified as a valid or invalid code point. The confidence
 * is derived from those counts and from whether the first unit is a byte
 * order mark (U+FEFF).
 */
class UTF_32 implements Recogniser, WithGetChar {
  name() {
    return 'UTF-32';
  }

  /**
   * Decodes the 32-bit unit that starts at `index`.
   *
   * This base implementation always returns -1, which `match` counts as an
   * invalid unit; subclasses override it with an endian-specific decoder.
   */
  getChar(input: Uint8Array, index: number): number {
    return -1;
  }

  /**
   * @param det detection context; `fRawInput` and `fRawLength` are read.
   * @returns the result of `match(det, this, confidence)` when the computed
   *   confidence is non-zero, otherwise `null`.
   */
  match(det: Context) {
    const input = det.fRawInput;
    // Only whole 4-byte units can be decoded. JavaScript division is
    // floating point, so the remainder is subtracted explicitly to round the
    // length down to a multiple of four.
    const limit = det.fRawLength - (det.fRawLength % 4);
    let numValid = 0;
    let numInvalid = 0;
    let hasBOM = false;
    let confidence = 0;

    if (limit === 0) {
      return null;
    }

    if (this.getChar(input, 0) === 0x0000feff) {
      hasBOM = true;
    }

    for (let i = 0; i < limit; i += 4) {
      const ch = this.getChar(input, i);

      // Invalid: negative values, values at or above 0x10ffff, and the
      // surrogate range 0xd800-0xdfff.
      if (ch < 0 || ch >= 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff)) {
        numInvalid += 1;
      } else {
        numValid += 1;
      }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    //    and the existence of valid and/or invalid multi-byte sequences.
    if (hasBOM && numInvalid === 0) {
      confidence = 100;
    } else if (hasBOM && numValid > numInvalid * 10) {
      confidence = 80;
    } else if (numValid > 3 && numInvalid === 0) {
      confidence = 100;
    } else if (numValid > 0 && numInvalid === 0) {
      confidence = 80;
    } else if (numValid > numInvalid * 10) {
      // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance.
      confidence = 25;
    }

    return confidence === 0 ? null : match(det, this, confidence);
  }
}
 
/**
 * UTF-32 recogniser that decodes each 4-byte unit most-significant byte
 * first.
 */
export class UTF_32BE extends UTF_32 {
  name() {
    return 'UTF-32BE';
  }

  /**
   * Combines the four bytes starting at `index` into one 32-bit value,
   * treating `input[index]` as the highest-order byte.
   */
  getChar(input: Uint8Array, index: number) {
    const high = input[index + 0] & 0xff;
    const midHigh = input[index + 1] & 0xff;
    const midLow = input[index + 2] & 0xff;
    const low = input[index + 3] & 0xff;

    return (high << 24) | (midHigh << 16) | (midLow << 8) | low;
  }
}
 
/**
 * UTF-32 recogniser that decodes each 4-byte unit least-significant byte
 * first.
 */
export class UTF_32LE extends UTF_32 {
  name() {
    return 'UTF-32LE';
  }

  /**
   * Combines the four bytes starting at `index` into one 32-bit value,
   * treating `input[index + 3]` as the highest-order byte.
   */
  getChar(input: Uint8Array, index: number) {
    const high = input[index + 3] & 0xff;
    const midHigh = input[index + 2] & 0xff;
    const midLow = input[index + 1] & 0xff;
    const low = input[index + 0] & 0xff;

    return (high << 24) | (midHigh << 16) | (midLow << 8) | low;
  }
}