00001 /* 00002 * The Apache Software License, Version 1.1 00003 * 00004 * Copyright (c) 1999-2000 The Apache Software Foundation. All rights 00005 * reserved. 00006 * 00007 * Redistribution and use in source and binary forms, with or without 00008 * modification, are permitted provided that the following conditions 00009 * are met: 00010 * 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 00014 * 2. Redistributions in binary form must reproduce the above copyright 00015 * notice, this list of conditions and the following disclaimer in 00016 * the documentation and/or other materials provided with the 00017 * distribution. 00018 * 00019 * 3. The end-user documentation included with the redistribution, 00020 * if any, must include the following acknowledgment: 00021 * "This product includes software developed by the 00022 * Apache Software Foundation (http://www.apache.org/)." 00023 * Alternately, this acknowledgment may appear in the software itself, 00024 * if and wherever such third-party acknowledgments normally appear. 00025 * 00026 * 4. The names "Xerces" and "Apache Software Foundation" must 00027 * not be used to endorse or promote products derived from this 00028 * software without prior written permission. For written 00029 * permission, please contact apache\@apache.org. 00030 * 00031 * 5. Products derived from this software may not be called "Apache", 00032 * nor may "Apache" appear in their name, without prior written 00033 * permission of the Apache Software Foundation. 00034 * 00035 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 00036 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00037 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00038 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 00039 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00040 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00041 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 00042 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00043 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 00044 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 00045 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00046 * SUCH DAMAGE. 00047 * ==================================================================== 00048 * 00049 * This software consists of voluntary contributions made by many 00050 * individuals on behalf of the Apache Software Foundation, and was 00051 * originally based on software copyright (c) 1999, International 00052 * Business Machines, Inc., http://www.ibm.com . For more information 00053 * on the Apache Software Foundation, please see 00054 * <http://www.apache.org/>. 00055 */ 00056 00057 /* 00058 * $Id: XMLRecognizer.hpp,v 1.1 2002/05/11 20:04:08 bhavani Exp $ 00059 */ 00060 00061 #if !defined(XMLRECOGNIZER_HPP) 00062 #define XMLRECOGNIZER_HPP 00063 00072 class XMLPARSER_EXPORT XMLRecognizer 00073 { 00074 public : 00075 // ----------------------------------------------------------------------- 00076 // Class types 00077 // 00078 // This enum represents the various encoding families that we have to 00079 // deal with individually at the scanner level. This does not indicate 00080 // the exact encoding, just the rough family that would let us scan 00081 // the XML/TextDecl to find the encoding string. 00082 // 00083 // The 'L's and 'B's stand for little or big endian. We conditionally 00084 // create versions that will automatically map to the local UTF-16 and 00085 // UCS-4 endian modes. 00086 // 00087 // OtherEncoding means that its some transcoder based encoding, i.e. not 00088 // one of the ones that we do internally. Its a special case and should 00089 // never be used directly outside of the reader. 00090 // 00091 // NOTE: Keep this in sync with the name map array in the Cpp file!! 00092 // ----------------------------------------------------------------------- 00093 enum Encodings 00094 { 00095 EBCDIC = 0 00096 , UCS_4B = 1 00097 , UCS_4L = 2 00098 , US_ASCII = 3 00099 , UTF_8 = 4 00100 , UTF_16B = 5 00101 , UTF_16L = 6 00102 00103 , Encodings_Count 00104 , Encodings_Min = EBCDIC 00105 , Encodings_Max = UTF_16L 00106 00107 , OtherEncoding = 999 00108 00109 #if defined(ENDIANMODE_BIG) 00110 , Def_UTF16 = UTF_16B 00111 , Def_UCS4 = UCS_4B 00112 #else 00113 , Def_UTF16 = UTF_16L 00114 , Def_UCS4 = UCS_4L 00115 #endif 00116 }; 00117 00118 00119 // ----------------------------------------------------------------------- 00120 // Public, const static data 00121 // 00122 // These are the byte sequences for each of the encodings that we can 00123 // auto sense, and their lengths. 00124 // ----------------------------------------------------------------------- 00125 static const char fgASCIIPre[]; 00126 static const unsigned int fgASCIIPreLen; 00127 static const XMLByte fgEBCDICPre[]; 00128 static const unsigned int fgEBCDICPreLen; 00129 static const XMLByte fgUTF16BPre[]; 00130 static const XMLByte fgUTF16LPre[]; 00131 static const unsigned int fgUTF16PreLen; 00132 static const XMLByte fgUCS4BPre[]; 00133 static const XMLByte fgUCS4LPre[]; 00134 static const unsigned int fgUCS4PreLen; 00135 static const char fgUTF8BOM[]; 00136 static const unsigned int fgUTF8BOMLen; 00137 00138 00139 // ----------------------------------------------------------------------- 00140 // Encoding recognition methods 00141 // ----------------------------------------------------------------------- 00142 static Encodings basicEncodingProbe 00143 ( 00144 const XMLByte* const rawBuffer 00145 , const unsigned int rawByteCount 00146 ); 00147 00148 static Encodings encodingForName 00149 ( 00150 const XMLCh* const theEncName 00151 ); 00152 00153 static const XMLCh* nameForEncoding(const Encodings theEncoding); 00154 00155 00156 protected : 00157 // ----------------------------------------------------------------------- 00158 // Unimplemented constructors, operators, and destructor 00159 // 00160 // This class is effectively being used as a namespace for some static 00161 // methods. 00162 // 00163 // (these functions are protected rather than private only to get rid of 00164 // some annoying compiler warnings.) 00165 // 00166 // ----------------------------------------------------------------------- 00167 XMLRecognizer(); 00168 ~XMLRecognizer(); 00169 void operator=(const XMLRecognizer&); 00170 }; 00171 00172 #endif