aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/yaml/snakeyaml/reader/UnicodeReader.java
blob: 4c9c9039ade2d4e255853eae428f7218ba459ed5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/**
 * Copyright (c) 2008, SnakeYAML
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.yaml.snakeyaml.reader;

/**
 * version: 1.1 / 2007-01-25 - changed BOM recognition ordering (longer BOMs first)
 *
 * Original pseudocode: Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 * Implementation changed: Andrey Somov
 *   - UTF-32 removed because it is not supported by YAML
 *   - no default encoding
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * BOMs:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad: Unicode format = UTF-16LE
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

/**
 * Generic Unicode text reader that uses a leading byte order mark (BOM) to identify the encoding
 * of the underlying stream. UTF-8, UTF-16BE and UTF-16LE BOMs are recognised and skipped; if no
 * BOM is found the stream is decoded as UTF-8.
 */
public class UnicodeReader extends Reader {

  private static final Charset UTF8 = StandardCharsets.UTF_8;
  private static final Charset UTF16BE = StandardCharsets.UTF_16BE;
  private static final Charset UTF16LE = StandardCharsets.UTF_16LE;

  // Raw byte stream; the pushback buffer lets init() return non-BOM bytes it had to peek at.
  PushbackInputStream internalIn;
  // Decoding reader; stays null until init() has detected the encoding.
  InputStreamReader internalIn2 = null;

  // Length in bytes of the longest supported BOM (UTF-8: EF BB BF).
  private static final int BOM_SIZE = 3;

  /**
   * @param in InputStream to be read
   */
  public UnicodeReader(InputStream in) {
    internalIn = new PushbackInputStream(in, BOM_SIZE);
  }

  /**
   * Get stream encoding or NULL if stream is uninitialized. Call init() or read() method to
   * initialize it.
   *
   * @return the name of the character encoding being used by this stream, or {@code null} if the
   *         encoding has not been detected yet
   */
  public String getEncoding() {
    // Guard against use before init(): the decoding reader does not exist yet.
    if (internalIn2 == null) {
      return null;
    }
    return internalIn2.getEncoding();
  }

  /**
   * Read ahead up to {@value #BOM_SIZE} bytes and check for BOM marks. Bytes that are not part of
   * a BOM are pushed back onto the stream; only BOM bytes are skipped. Calling this method again
   * after a successful initialization is a no-op.
   *
   * @throws IOException if reading from the underlying stream fails
   */
  protected void init() throws IOException {
    if (internalIn2 != null) {
      return;
    }

    byte[] bom = new byte[BOM_SIZE];
    // A single read() may legally return fewer bytes than requested, so keep reading until the
    // buffer is full or the stream ends; otherwise a BOM split across reads would be missed.
    int n = 0;
    while (n < bom.length) {
      int count = internalIn.read(bom, n, bom.length - n);
      if (count <= 0) {
        break;
      }
      n += count;
    }

    Charset encoding;
    int bomLength;
    if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
      encoding = UTF8;
      bomLength = 3;
    } else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
      encoding = UTF16BE;
      bomLength = 2;
    } else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
      encoding = UTF16LE;
      bomLength = 2;
    } else {
      // Unicode BOM mark not found: fall back to UTF-8 and unread all bytes
      encoding = UTF8;
      bomLength = 0;
    }

    // Give back everything that was read past the BOM.
    int unread = n - bomLength;
    if (unread > 0) {
      internalIn.unread(bom, bomLength, unread);
    }

    // Use an explicit decoder so that unmappable characters are reported as errors.
    CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(CodingErrorAction.REPORT);
    internalIn2 = new InputStreamReader(internalIn, decoder);
  }

  /**
   * Close the reader and the underlying stream. If nothing has been read yet, the underlying
   * stream is closed directly without attempting BOM detection, so closing never performs a
   * read and the stream is released even when reading from it would fail.
   *
   * @throws IOException if closing the underlying stream fails
   */
  @Override
  public void close() throws IOException {
    if (internalIn2 != null) {
      internalIn2.close();
    } else {
      internalIn.close();
    }
  }

  /**
   * Read decoded characters into a portion of an array, detecting the encoding on first use.
   *
   * @param cbuf destination buffer
   * @param off offset at which to start storing characters
   * @param len maximum number of characters to read
   * @return the number of characters read, or -1 if the end of the stream has been reached
   * @throws IOException if reading from the underlying stream or decoding fails
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    init();
    return internalIn2.read(cbuf, off, len);
  }
}