Ticket #6151: EncodingInfo.java

File EncodingInfo.java, 5.4 kB (added by hsivonen@..., 1 year ago)

Test program that triggers the crash

Line 
1 /*
2  * Copyright (c) 2006 Henri Sivonen
3  * Copyright (c) 2008 Mozilla Foundation
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23
24 package nu.validator.htmlparser.impl;
25
26 import java.io.ByteArrayInputStream;
27 import java.io.IOException;
28 import java.io.InputStreamReader;
29 import java.io.Reader;
30 import java.nio.charset.Charset;
31 import java.nio.charset.CharsetDecoder;
32 import java.nio.charset.CodingErrorAction;
33 import java.util.Arrays;
34 import java.util.Iterator;
35 import java.util.Map;
36 import java.util.SortedMap;
37 import java.util.SortedSet;
38 import java.util.TreeSet;
39
/**
 * Classifies the character encodings supported by the running JVM.
 *
 * <p>When the class is initialized, every charset returned by
 * {@link Charset#availableCharsets()} is probed with a buffer of ASCII bytes
 * and sorted into one of two lists: charsets that decode those bytes to the
 * code points of the same value ("ASCII supersets") and charsets that do not.
 * The class also knows a fixed list of encoding names that are not considered
 * obscure.
 */
public class EncodingInfo {

    /**
     * Lower-case names of the encodings that are not considered obscure.
     *
     * <p>This array must stay sorted in natural {@link String} order because
     * {@link #isObscure(String)} runs a binary search over it.
     */
    private static final String[] NOT_OBSCURE = {
        "big5",
        "big5-hkscs",
        "euc-jp",
        "euc-kr",
        "gb18030",
        "gbk",
        "iso-2022-jp",
        "iso-2022-kr",
        "iso-8859-1",
        "iso-8859-13",
        "iso-8859-15",
        "iso-8859-2",
        "iso-8859-3",
        "iso-8859-4",
        "iso-8859-5",
        "iso-8859-6",
        "iso-8859-7",
        "iso-8859-8",
        "iso-8859-9",
        "koi8-r",
        "shift_jis",
        "tis-620",
        "us-ascii",
        "utf-16",
        "utf-16be",
        "utf-16le",
        "utf-8",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1254",
        "windows-1255",
        "windows-1256",
        "windows-1257",
        "windows-1258"
    };

    /** Sorted canonical names of the charsets that passed the ASCII probe. */
    private static final String[] asciiSuperset;

    /** Sorted canonical names of the charsets that failed the ASCII probe. */
    private static final String[] notAsciiSuperset;

    static {
        // Probe buffer: the bytes 0x20..0x7F followed by LF, CR and TAB.
        byte[] testBuf = new byte[0x63];
        for (int i = 0; i < 0x60; i++) {
            testBuf[i] = (byte) (i + 0x20);
        }
        testBuf[0x60] = (byte) '\n';
        testBuf[0x61] = (byte) '\r';
        testBuf[0x62] = (byte) '\t';

        // TreeSet keeps the names in natural order, which is the order
        // Arrays.binarySearch needs in the lookup methods below.
        SortedSet<String> asciiSupersetSet = new TreeSet<String>();
        SortedSet<String> notAsciiSupersetSet = new TreeSet<String>();

        for (Charset cs : Charset.availableCharsets().values()) {
            if (asciiMapsToBasicLatin(testBuf, cs)) {
                asciiSupersetSet.add(cs.name());
            } else {
                notAsciiSupersetSet.add(cs.name());
            }
        }

        asciiSuperset = asciiSupersetSet.toArray(new String[0]);
        notAsciiSuperset = notAsciiSupersetSet.toArray(new String[0]);
    }

    /**
     * Tells whether the named charset passed the ASCII probe.
     *
     * @param preferredIanaName the charset name to look up; it is compared
     *        case-sensitively against the canonical names reported by
     *        {@link Charset#name()}
     * @return {@code true} if the name is in the list of charsets that decode
     *         the probe bytes to the code points of the same value
     */
    public static boolean isAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(asciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the named charset failed the ASCII probe.
     *
     * <p>A name that is unknown to the JVM is in neither list, so this is not
     * simply the negation of {@link #isAsciiSuperset(String)}.
     *
     * @param preferredIanaName the charset name to look up; it is compared
     *        case-sensitively against the canonical names reported by
     *        {@link Charset#name()}
     * @return {@code true} if the name is in the list of charsets that do not
     *         decode the probe bytes to the code points of the same value
     */
    public static boolean isNotAsciiSuperset(String preferredIanaName) {
        return Arrays.binarySearch(notAsciiSuperset, preferredIanaName) >= 0;
    }

    /**
     * Tells whether the named encoding is absent from the fixed list of
     * encodings that are not considered obscure.
     *
     * @param preferredIanaName the encoding name to look up, in any letter case
     * @return {@code true} if the lower-cased name is not in the list
     */
    public static boolean isObscure(String preferredIanaName) {
        // Lower-case with the root locale: with the default locale a Turkish
        // environment maps 'I' to dotless 'ı', so "ISO-8859-1" would no
        // longer match "iso-8859-1".
        String lowerCased = preferredIanaName.toLowerCase(java.util.Locale.ROOT);
        return Arrays.binarySearch(NOT_OBSCURE, lowerCased) < 0;
    }

    /**
     * Checks whether a charset decodes every byte of the probe buffer to the
     * character whose code point equals the (unsigned) byte value.
     *
     * @param testBuf the bytes to decode
     * @param cs the charset under test
     * @return {@code true} if every byte decoded to the character of the same
     *         value; {@code false} on any mismatch, on premature end of
     *         input, or if decoding failed with an exception
     */
    private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) {
        try {
            // Creating and configuring the decoder happens inside the try
            // block so that a charset implementation that throws here cannot
            // abort the static initializer of this class.
            CharsetDecoder dec = cs.newDecoder();
            // REPORT makes decoding errors surface as exceptions instead of
            // being silently replaced.
            dec.onMalformedInput(CodingErrorAction.REPORT);
            dec.onUnmappableCharacter(CodingErrorAction.REPORT);
            // The reader wraps an in-memory stream only, so it is not closed.
            Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec);
            for (int i = 0; i < testBuf.length; i++) {
                // read() returns -1 at end of input, which never equals a
                // masked byte value, so a short decode also fails here.
                if ((testBuf[i] & 0xFF) != r.read()) {
                    return false;
                }
            }
            return true;
        } catch (IOException e) {
            // Includes the CharacterCodingException raised by REPORT.
            return false;
        } catch (RuntimeException e) {
            // A charset implementation that throws is treated as a failed probe.
            return false;
        }
    }

    /**
     * Prints both charset lists to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("ASCII maps to Basic Latin:");
        for (String name : asciiSuperset) {
            System.out.println(name);
        }
        System.out.println();
        System.out.println("ASCII does not map to Basic Latin:");
        for (String name : notAsciiSuperset) {
            System.out.println(name);
        }
    }
}