Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
RFC4627AwareInputStreamReader |
|
| 7.75;7,75 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one | |
3 | * or more contributor license agreements. See the NOTICE file | |
4 | * distributed with this work for additional information | |
5 | * regarding copyright ownership. The ASF licenses this file | |
6 | * to you under the Apache License, Version 2.0 (the | |
7 | * "License"); you may not use this file except in compliance | |
8 | * with the License. You may obtain a copy of the License at | |
9 | * | |
10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
11 | * | |
12 | * Unless required by applicable law or agreed to in writing, | |
13 | * software distributed under the License is distributed on an | |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
15 | * KIND, either express or implied. See the License for the | |
16 | * specific language governing permissions and limitations | |
17 | * under the License. | |
18 | */ | |
19 | package org.apache.johnzon.core; | |
20 | ||
21 | import java.io.IOException; | |
22 | import java.io.InputStream; | |
23 | import java.io.InputStreamReader; | |
24 | import java.io.PushbackInputStream; | |
25 | import java.nio.charset.Charset; | |
26 | ||
27 | import javax.json.JsonException; | |
28 | ||
29 | final class RFC4627AwareInputStreamReader extends InputStreamReader { | |
30 | ||
31 | RFC4627AwareInputStreamReader(final InputStream in) { | |
32 | 278 | this(new PushbackInputStream(in,4)); |
33 | 275 | } |
34 | ||
35 | private RFC4627AwareInputStreamReader(final PushbackInputStream in) { | |
36 | 278 | super(in, getCharset(in).newDecoder()); |
37 | ||
38 | 275 | } |
39 | ||
40 | /** | |
41 | * According to the Java API "An attempt is made to read as many as len bytes, but a smaller number may be read". | |
42 | * [http://docs.oracle.com/javase/7/docs/api/java/io/InputStream.html#read(byte[],%20int,%20int)] | |
43 | * For this reason we need to ensure that we've read all the bytes that we need out of this stream. | |
44 | */ | |
45 | private static byte[] readAllBytes(final PushbackInputStream inputStream) throws IOException { | |
46 | 278 | final int first = inputStream.read(); |
47 | 277 | final int second = inputStream.read(); |
48 | 277 | if(first == -1|| second == -1) { |
49 | 2 | throw new JsonException("Invalid Json. Valid Json has at least 2 bytes"); |
50 | } | |
51 | 275 | final int third = inputStream.read(); |
52 | 275 | final int fourth = inputStream.read(); |
53 | 275 | if(third == -1) { |
54 | 2 | return new byte[] { (byte) first, (byte) second }; |
55 | 273 | } else if(fourth == -1) { |
56 | 1 | return new byte[] { (byte) first, (byte) second, (byte) third }; |
57 | } else { | |
58 | 272 | return new byte[] { (byte) first, (byte) second, (byte) third, (byte) fourth }; |
59 | } | |
60 | } | |
61 | ||
62 | /* | |
63 | * RFC 4627 | |
64 | ||
65 | JSON text SHALL be encoded in Unicode. The default encoding is | |
66 | UTF-8. | |
67 | | |
68 | Since the first two characters of a JSON text will always be ASCII | |
69 | characters [RFC0020], it is possible to determine whether an octet | |
70 | stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking | |
71 | at the pattern of nulls in the first four octets. | |
72 | ||
73 | 00 00 00 xx UTF-32BE | |
74 | 00 xx 00 xx UTF-16BE | |
75 | xx 00 00 00 UTF-32LE | |
76 | xx 00 xx 00 UTF-16LE | |
77 | xx xx xx xx UTF-8 | |
78 | ||
79 | */ | |
80 | ||
81 | private static Charset getCharset(final PushbackInputStream inputStream) { | |
82 | 278 | Charset charset = Charset.forName("UTF-8"); |
83 | 278 | int bomLength=0; |
84 | try { | |
85 | 278 | final byte[] utfBytes = readAllBytes(inputStream); |
86 | 275 | int first = (utfBytes[0] & 0xFF); |
87 | 275 | int second = (utfBytes[1] & 0xFF); |
88 | 275 | if (first == 0x00) { |
89 | 4 | charset = (second == 0x00) ? Charset.forName("UTF-32BE") : Charset.forName("UTF-16BE"); |
90 | 271 | } else if (utfBytes.length > 2 && second == 0x00) { |
91 | 3 | int third = (utfBytes[2] & 0xFF); |
92 | 3 | charset = (third == 0x00) ? Charset.forName("UTF-32LE") : Charset.forName("UTF-16LE"); |
93 | 3 | } else { |
94 | ||
95 | /*check BOM | |
96 | ||
97 | Encoding hex byte order mark | |
98 | UTF-8 EF BB BF | |
99 | UTF-16 (BE) FE FF | |
100 | UTF-16 (LE) FF FE | |
101 | UTF-32 (BE) 00 00 FE FF | |
102 | UTF-32 (LE) FF FE 00 00 | |
103 | */ | |
104 | ||
105 | //We do not check for UTF-32BE because that is already covered above and we | |
106 | //do not to unread anything. | |
107 | ||
108 | 268 | if(first == 0xFE && second == 0xFF) { |
109 | 2 | charset = Charset.forName("UTF-16BE"); |
110 | 2 | bomLength=2; |
111 | 266 | } else if(first == 0xFF && second == 0xFE) { |
112 | 3 | if(utfBytes.length > 3 && (utfBytes[2]&0xff) == 0x00 && (utfBytes[3]&0xff) == 0x00) { |
113 | 2 | charset = Charset.forName("UTF-32LE"); |
114 | 2 | bomLength=4; |
115 | }else { | |
116 | 1 | charset = Charset.forName("UTF-16LE"); |
117 | 1 | bomLength=2; |
118 | } | |
119 | 263 | } else if (utfBytes.length > 2 && first == 0xEF && second == 0xBB && (utfBytes[2]&0xff) == 0xBF) { |
120 | //UTF-8 with BOM | |
121 | 2 | bomLength=3; |
122 | } | |
123 | } | |
124 | //assume UTF8 | |
125 | 275 | if(bomLength > 0 && bomLength < 4) { |
126 | //do not unread BOM, only bytes after BOM | |
127 | 5 | inputStream.unread(utfBytes,bomLength,utfBytes.length - bomLength); |
128 | } else { | |
129 | //no BOM, unread all read bytes | |
130 | 270 | inputStream.unread(utfBytes); |
131 | } | |
132 | ||
133 | ||
134 | 1 | } catch (final IOException e) { |
135 | 1 | throw new JsonException("Unable to detect charset due to "+e.getMessage(), e); |
136 | 275 | } |
137 | ||
138 | 275 | return charset; |
139 | } | |
140 | ||
141 | } |