All files / src/encoding iso2022.ts

92.5% Statements 37/40
81.25% Branches 13/16
88.89% Functions 8/9
96.97% Lines 32/33

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133    6x                 18x                                           102x 102x 102x       102x 102x   102x 184194x 156x         952x   952x   952x 1464x   52x 52x 52x     104x       184142x     102x               3x         3x   3x       6x   1x   6x                               6x   1x   6x     6x   1x   6x                            
import { Context, Recogniser } from '.';
 
var match = require('../match').default;
 
/**
 * Common base class for the detectors of the individual members of the
 * ISO 2022 family of encodings.
 *
 * Subclasses provide their own `name()` and `escapeSequences`; the scoring
 * logic in `match()` is shared between them.
 */
class ISO_2022 implements Recogniser {
  /**
   * Escape sequences this detector recognises, one array of byte values per
   * sequence. `match()` only starts a comparison at an ESC byte (0x1b) in
   * the input and compares from the second byte onward, so every entry is
   * expected to start with 0x1b.
   */
  escapeSequences: number[][] = [];

  /** Name reported for this recogniser. */
  name() {
    return 'ISO_2022';
  }

  /**
   * Matching function shared among the 2022 detectors JP, CN and KR.
   *
   * Counts the recognised and unrecognised escape sequences in the sample
   * and computes a score from their total number and from the proportion
   * that fit this encoding.
   *
   * @param det detection context; only `fInputBytes` (the bytes to analyse)
   *   and `fInputLen` (how many of them to scan) are read here.
   * @returns `null` when no known escape sequence was found or the computed
   *   quality is not positive; otherwise whatever the imported `match`
   *   helper returns for this context, this recogniser and the quality
   *   (a value greater than 0 and at most 100).
   */
  match(det: Context) {
    // TODO: refactor me
    const text = det.fInputBytes;
    const textLen = det.fInputLen;
    const escapeSequences = this.escapeSequences;

    let hits = 0; // ESC bytes that start a known sequence
    let misses = 0; // ESC bytes that start no known sequence
    let shifts = 0; // shift-out (0x0e) / shift-in (0x0f) bytes

    scanInput: for (let i = 0; i < textLen; i++) {
      if (text[i] === 0x1b) {
        checkEscapes: for (let escN = 0; escN < escapeSequences.length; escN++) {
          const seq = escapeSequences[escN];

          // Not enough input left to hold this sequence.
          if (textLen - i < seq.length) continue checkEscapes;

          // Byte 0 is the ESC that was just matched; compare the rest.
          for (let j = 1; j < seq.length; j++) {
            if (seq[j] !== text[i + j]) continue checkEscapes;
          }

          hits++;
          // Skip the remainder of the sequence; the loop's i++ then moves
          // to the first byte after it.
          i += seq.length - 1;
          continue scanInput;
        }

        misses++;
      }

      // Shift in/out
      if (text[i] === 0x0e || text[i] === 0x0f) shifts++;
    }

    if (hits === 0) return null;

    //
    // Initial quality is based on relative proportion of recognized vs.
    //   unrecognized escape sequences.
    //   All good:  quality = 100;
    //   half or less good: quality <= 0 (reported as no match below);
    //   linear in between.
    let quality = (100 * hits - 100 * misses) / (hits + misses);

    // Back off quality if there were too few escape sequences seen.
    //   Include shifts in this computation, so that KR does not get penalized
    //   for having only a single Escape sequence, but many shifts.
    if (hits + shifts < 5) quality -= (5 - (hits + shifts)) * 10;

    return quality <= 0 ? null : match(det, this, quality);
  }
}
 
/**
 * Detector for ISO-2022-JP. All scoring is inherited from `ISO_2022`; this
 * class only supplies the reported name and the escape-sequence table.
 */
export class ISO_2022_JP extends ISO_2022 {
  // Escape sequences counted as hits for this detector. Each row shows the
  // ASCII form of the bytes followed by the character set it is listed for.
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x28, 0x43], // ESC $ ( C - KS X 1001:1992
    [0x1b, 0x24, 0x28, 0x44], // ESC $ ( D - JIS X 212-1990
    [0x1b, 0x24, 0x40], //       ESC $ @   - JIS C 6226-1978
    [0x1b, 0x24, 0x41], //       ESC $ A   - GB 2312-80
    [0x1b, 0x24, 0x42], //       ESC $ B   - JIS X 208-1983
    [0x1b, 0x26, 0x40], //       ESC & @   - JIS X 208 1990, 1997
    [0x1b, 0x28, 0x42], //       ESC ( B   - ASCII
    [0x1b, 0x28, 0x48], //       ESC ( H   - JIS-Roman
    [0x1b, 0x28, 0x49], //       ESC ( I   - Half-width katakana
    [0x1b, 0x28, 0x4a], //       ESC ( J   - JIS-Roman
    [0x1b, 0x2e, 0x41], //       ESC . A   - ISO 8859-1
    [0x1b, 0x2e, 0x46], //       ESC . F   - ISO 8859-7
  ];

  /** Name reported for this detector. */
  name(): string {
    return 'ISO-2022-JP';
  }
}
 
/**
 * Detector for ISO-2022-KR. All scoring is inherited from `ISO_2022`; this
 * class only supplies the reported name and the escape-sequence table.
 */
export class ISO_2022_KR extends ISO_2022 {
  // The single escape sequence counted as a hit for this detector.
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x29, 0x43], // ESC $ ) C
  ];

  /** Name reported for this detector. */
  name(): string {
    return 'ISO-2022-KR';
  }
}
 
/**
 * Detector for ISO-2022-CN. All scoring is inherited from `ISO_2022`; this
 * class only supplies the reported name and the escape-sequence table.
 */
export class ISO_2022_CN extends ISO_2022 {
  // Escape sequences counted as hits for this detector. Each row shows the
  // ASCII form of the bytes followed by what the sequence is listed for.
  escapeSequences: number[][] = [
    [0x1b, 0x24, 0x29, 0x41], // ESC $ ) A - GB 2312-80
    [0x1b, 0x24, 0x29, 0x47], // ESC $ ) G - CNS 11643-1992 Plane 1
    [0x1b, 0x24, 0x2a, 0x48], // ESC $ * H - CNS 11643-1992 Plane 2
    [0x1b, 0x24, 0x29, 0x45], // ESC $ ) E - ISO-IR-165
    [0x1b, 0x24, 0x2b, 0x49], // ESC $ + I - CNS 11643-1992 Plane 3
    [0x1b, 0x24, 0x2b, 0x4a], // ESC $ + J - CNS 11643-1992 Plane 4
    [0x1b, 0x24, 0x2b, 0x4b], // ESC $ + K - CNS 11643-1992 Plane 5
    [0x1b, 0x24, 0x2b, 0x4c], // ESC $ + L - CNS 11643-1992 Plane 6
    [0x1b, 0x24, 0x2b, 0x4d], // ESC $ + M - CNS 11643-1992 Plane 7
    [0x1b, 0x4e], //             ESC N     - SS2
    [0x1b, 0x4f], //             ESC O     - SS3
  ];

  /** Name reported for this detector. */
  name(): string {
    return 'ISO-2022-CN';
  }
}