Source code and example code online: Sniffed Xml Reader

  
 /*   Copyright 2004 The Apache Software Foundation
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
  *   You may obtain a copy of the License at
  *
  *       http://www.apache.org/licenses/LICENSE-2.0
  *
  *   Unless required by applicable law or agreed to in writing, software
  *   distributed under the License is distributed on an "AS IS" BASIS,
  *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *   See the License for the specific language governing permissions and
  *  limitations under the License.
  */
 
 // Revised from Apache XMLBeans.
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 
 public class SniffedXmlReader extends BufferedReader {
   // We don't sniff more than 192 bytes.
   public static int MAX_SNIFFED_CHARS = 192;
 
   public SniffedXmlReader(Reader reader) throws IOException {
     super(reader);
     _encoding = sniffForXmlDecl();
   }
 
   private int readAsMuchAsPossible(char[] buf, int startAt, int len) throws IOException {
     int total = 0;
     while (total < len) {
       int count = read(buf, startAt + total, len - total);
       if (count < 0)
         break;
       total += count;
     }
     return total;
   }
 
   // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
   // with the common charsets.
 
   private static Charset dummy1 = Charset.forName("UTF-8");
 
   private static Charset dummy2 = Charset.forName("UTF-16");
 
   private static Charset dummy3 = Charset.forName("UTF-16BE");
 
   private static Charset dummy4 = Charset.forName("UTF-16LE");
 
   private static Charset dummy5 = Charset.forName("ISO-8859-1");
 
   private static Charset dummy6 = Charset.forName("US-ASCII");
 
   private static Charset dummy7 = Charset.forName("Cp1252");
 
   private String sniffForXmlDecl() throws IOException {
     mark(MAX_SNIFFED_CHARS);
     try {
       char[] buf = new char[MAX_SNIFFED_CHARS];
       int limit = readAsMuchAsPossible(buf, 0, MAX_SNIFFED_CHARS);
       return SniffedXmlInputStream.extractXmlDeclEncoding(buf, 0, limit);
     } finally {
       reset();
     }
   }
 
   private String _encoding;
 
   public String getXmlEncoding() {
     return _encoding;
   }
 }
 
 class SniffedXmlInputStream extends BufferedInputStream
 {
     // We don't sniff more than 192 bytes.
     // sniffForXmlDecl uses this value both as the mark() read limit and as the
     // size of the scratch buffer it fills from the start of the stream.
     public static int MAX_SNIFFED_BYTES = 192;
 
     public SniffedXmlInputStream(InputStream stream) throws IOException
     {
         super(stream);
 
         // read byte order marks and detect EBCDIC etc
         _encoding = sniffFourBytes();
 
         if (_encoding != null && _encoding.equals("IBM037"))
         {
             // First four bytes suggest EBCDIC with <?xm at start
             String encoding = sniffForXmlDecl(_encoding);
             if (encoding != null)
                 _encoding = encoding;
         }
 
         if (_encoding == null)
         {
             // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
             // assuming we can read it as UTF-8.
             _encoding = sniffForXmlDecl("UTF-8");
         }
 
         if (_encoding == null)
         {
             // The XML spec says these two things:
 
             // (1) "In the absence of external character encoding information
             // (such as MIME headers), parsed entities which are stored in an
             // encoding other than UTF-8 or UTF-16 must begin with a text
             // declaration (see 4.3.1 The Text Declaration) containing an
             // encoding declaration:"
 
             // (2) "In the absence of information provided by an external
             // transport protocol (e.g. HTTP or MIME), it is an error
             // for an entity including an encoding declaration to be
             // presented to the XML processor in an encoding other than
             // that named in the declaration, or for an entity which begins
             // with neither a Byte Order Mark nor an encoding declaration
             // to use an encoding other than UTF-8."
 
             // Since we're using a sniffed stream, we do not have external
             // character encoding information.
 
             // Since we're here, we also don't have a recognized byte order
             // mark or an explicit encoding declaration that can be read in
             // either ASCII or EBDIC style.
 
             // Therefore, we must use UTF-8.
 
             _encoding = "UTF-8";
         }
     }
 
     private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
     {
         int total = 0;
         while (total < len)
         {
             int count = read(buf, startAt + total, len - total);
             if (count < 0)
                 break;
             total += count;
         }
         return total;
     }
 
     private String sniffFourBytes() throws IOException
     {
         mark(4);
         int skip = 0;
         try
         {
             byte[] buf = new byte[4];
             if (readAsMuchAsPossible(buf, 0, 4) < 4)
                 return null;
             long result = 0xFF000000 & (buf[0] << 24) | 0x00FF0000 & (buf[1] << 16) | 0x0000FF00 & (buf[2] << 8) | 0x000000FF & buf[3];
 
             if (result == 0x0000FEFF)
                 return "UCS-4";
             else if (result == 0xFFFE0000)
                 return "UCS-4";
             else if (result == 0x0000003C)
                 return "UCS-4BE";
             else if (result == 0x3C000000)
                 return "UCS-4LE";
             else if (result == 0x003C003F)
                 return "UTF-16BE";
             else if (result == 0x3C003F00)
                 return "UTF-16LE";
             else if (result == 0x3C3F786D)
                 return null; // looks like US-ASCII with <?xml: sniff
             else if (result == 0x4C6FA794)
                 return "IBM037"; // Sniff for ebdic codepage
             else if ((result & 0xFFFF0000) == 0xFEFF0000)
                 return "UTF-16";
             else if ((result & 0xFFFF0000) == 0xFFFE0000)
                 return "UTF-16";
             else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
                 return "UTF-8";
             else return null;
         }
         finally
         {
             reset();
         }
     }
 
     // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
     // with the common charsets.
     //
     // Each initializer below simply calls Charset.forName once when the class
     // is initialized. The resulting fields are not read by any code visible in
     // this part of the file; they exist only for that lookup side effect.

     private static Charset dummy1 = Charset.forName("UTF-8");
     private static Charset dummy2 = Charset.forName("UTF-16");
     private static Charset dummy3 = Charset.forName("UTF-16BE");
     private static Charset dummy4 = Charset.forName("UTF-16LE");
     private static Charset dummy5 = Charset.forName("ISO-8859-1");
     private static Charset dummy6 = Charset.forName("US-ASCII");
     private static Charset dummy7 = Charset.forName("Cp1252");
 
 
     private String sniffForXmlDecl(String encoding) throws IOException
     {
         mark(MAX_SNIFFED_BYTES);
         try
         {
             byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
             int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);
 
             // BUGBUG in JDK: Charset.forName is not threadsafe.
             Charset charset = Charset.forName(encoding);
             Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
             char[] buf = new char[bytelimit];
             int limit = 0;
             while (limit < bytelimit)
             {
                 int count = reader.read(buf, limit, bytelimit - limit);
                 if (count < 0)
                     break;
                 limit += count;
             }
 
             return extractXmlDeclEncoding(buf, 0, limit);
         }
         finally
         {
             reset();
         }
     }
 
     // Encoding name chosen by the constructor (signature sniff, then XML
     // declaration sniff, then the "UTF-8" fallback).
     private String _encoding;

     /**
      * Returns the encoding name this stream settled on during construction.
      * The constructor always ends with a non-null value, because it falls
      * back to "UTF-8" when nothing else is detected.
      *
      * @return the detected encoding name
      */
     public String getXmlEncoding()
     {
         return _encoding;
     }
 
     /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
     {
         int limit = offset + size;
         int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
         if (xmlpi >= 0)
         {
             int i = xmlpi + 5;
             ScannedAttribute attr = new ScannedAttribute();
             while (i < limit)
             {
                 i = scanAttribute(buf, i, limit, attr);
                 if (i < 0)
                     return null;
                 if (attr.name.equals("encoding"))
                     return attr.value;
             }
         }
         return null;
     }
 
     private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
     {
         assert(s.length() > 0);
         char[] lookFor = s.toCharArray();
 
         char firstchar = lookFor[0];
         searching: for (limit -= lookFor.length; startAt < limit; startAt++)
         {
             if (buf[startAt] == firstchar)
             {
                 for (int i = 1; i < lookFor.length; i++)
                 {
                     if (buf[startAt + i] != lookFor[i])
                     {
                         continue searching;
                     }
                 }
                 return startAt;
             }
         }
 
         return -1;
     }
 
     private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
     {
         searching: for (; startAt < limit; startAt++)
         {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++)
                 if (thischar == lookFor[i])
                     continue searching;
             return startAt;
         }
         return -1;
     }
 
     private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
     {
         searching: for (; startAt < limit; startAt++)
         {
             int thischar = buf[startAt];
             for (int i = 0; i < lookFor.length; i++)
                 if (thischar == lookFor[i])
                     return startAt;
         }
         return -1;
     }
 
     private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
     {
         searching: for (; startAt < limit; startAt++)
         {
             if (bufBlogThis!Share to XShare to FacebookShare to Pinterest
Source code and example code online

Saturday, 30 July 2011

Sniffed Xml Reader

No comments:

Post a Comment