00001 /* 00002 * The Apache Software License, Version 1.1 00003 * 00004 * Copyright (c) 1999-2001 The Apache Software Foundation. All rights 00005 * reserved. 00006 * 00007 * Redistribution and use in source and binary forms, with or without 00008 * modification, are permitted provided that the following conditions 00009 * are met: 00010 * 00011 * 1. Redistributions of source code must retain the above copyright 00012 * notice, this list of conditions and the following disclaimer. 00013 * 00014 * 2. Redistributions in binary form must reproduce the above copyright 00015 * notice, this list of conditions and the following disclaimer in 00016 * the documentation and/or other materials provided with the 00017 * distribution. 00018 * 00019 * 3. The end-user documentation included with the redistribution, 00020 * if any, must include the following acknowledgment: 00021 * "This product includes software developed by the 00022 * Apache Software Foundation (http://www.apache.org/)." 00023 * Alternately, this acknowledgment may appear in the software itself, 00024 * if and wherever such third-party acknowledgments normally appear. 00025 * 00026 * 4. The names "Xerces" and "Apache Software Foundation" must 00027 * not be used to endorse or promote products derived from this 00028 * software without prior written permission. For written 00029 * permission, please contact apache\@apache.org. 00030 * 00031 * 5. Products derived from this software may not be called "Apache", 00032 * nor may "Apache" appear in their name, without prior written 00033 * permission of the Apache Software Foundation. 00034 * 00035 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 00036 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 00037 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 00038 * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 00039 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00040 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00041 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 00042 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 00043 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 00044 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 00045 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00046 * SUCH DAMAGE. 00047 * ==================================================================== 00048 * 00049 * This software consists of voluntary contributions made by many 00050 * individuals on behalf of the Apache Software Foundation, and was 00051 * originally based on software copyright (c) 1999, International 00052 * Business Machines, Inc., http://www.ibm.com . For more information 00053 * on the Apache Software Foundation, please see 00054 * <http://www.apache.org/>. 00055 */ 00056 00057 /* 00058 * $Log: XMLReader.hpp,v $ 00059 * Revision 1.1 2002/05/11 20:19:15 bhavani 00060 * CR#CR062582# adding xercesc 1.7 file 00061 * 00062 * Revision 1.1.1.1 2002/02/01 22:22:02 peiyongz 00063 * sane_include 00064 * 00065 * Revision 1.18 2001/12/06 17:47:04 tng 00066 * Performance Enhancement. Modify the handling of the fNEL option so that it results in fgCharCharsTable being modified, instead of having all of the low-level routines check the option. This seemed acceptable because the code appears to only permit the option to be turned on and not turned off again. By Henry Zongaro. 00067 * 00068 * Revision 1.17 2001/07/12 18:50:13 tng 00069 * Some performance modification regarding standalone check and xml decl check. 00070 * 00071 * Revision 1.16 2001/05/11 13:26:17 tng 00072 * Copyright update. 
00073 * 00074 * Revision 1.15 2001/05/03 18:42:51 knoaman 00075 * Added new option to the parsers so that the NEL (0x85) char can be treated as a newline character. 00076 * 00077 * Revision 1.14 2001/01/25 19:16:58 tng 00078 * const should be used instead of static const. Fixed by Khaled Noaman. 00079 * 00080 * Revision 1.13 2000/07/25 22:33:05 aruna1 00081 * Char definitions in XMLUni moved to XMLUniDefs 00082 * 00083 * Revision 1.12 2000/07/08 00:17:13 andyh 00084 * Cleanup of yesterday's speedup changes. Merged new bit into the 00085 * scanner character properties table. 00086 * 00087 * Revision 1.11 2000/07/07 01:08:44 andyh 00088 * Parser speed up in scan of XML content. 00089 * 00090 * Revision 1.10 2000/07/06 21:00:52 jpolast 00091 * inlined getNextCharIfNot() for better performance 00092 * 00093 * Revision 1.9 2000/05/11 23:11:33 andyh 00094 * Add missing validity checks for stand-alone documents, character range 00095 * and Well-formed parsed entities. Changes contributed by Sean MacRoibeaird 00096 * <sean.Macroibeaird@ireland.sun.com> 00097 * 00098 * Revision 1.8 2000/03/02 19:54:29 roddey 00099 * This checkin includes many changes done while waiting for the 00100 * 1.1.0 code to be finished. I can't list them all here, but a list is 00101 * available elsewhere. 00102 * 00103 * Revision 1.7 2000/02/24 20:18:07 abagchi 00104 * Swat for removing Log from API docs 00105 * 00106 * Revision 1.6 2000/02/06 07:47:53 rahulj 00107 * Year 2K copyright swat. 00108 * 00109 * Revision 1.5 2000/01/25 01:04:21 roddey 00110 * Fixes a bogus error about ]]> in char data. 00111 * 00112 * Revision 1.4 2000/01/22 00:01:08 roddey 00113 * Simple change to get rid of two hard coded 'x' type characters, which won't 00114 * work on EBCDIC systems. 00115 * 00116 * Revision 1.3 1999/12/18 00:20:00 roddey 00117 * More changes to support the new, completely orthagonal, support for 00118 * intrinsic encodings. 
*
* Revision 1.2 1999/12/15 19:48:03 roddey
* Changed to use new split of transcoder interfaces into XML transcoders and
* LCP transcoders, and implementation of intrinsic transcoders as pluggable
* transcoders, and addition of Latin1 intrinsic support.
*
* Revision 1.1.1.1 1999/11/09 01:08:22 twl
* Initial checkin
*
* Revision 1.3 1999/11/08 20:44:47 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/

#if !defined(XMLREADER_HPP)
#define XMLREADER_HPP

// NOTE(review): XMLUniDefs.hpp is included twice below; the second include is
// presumably a harmless no-op thanks to that header's own include guard.
// NOTE(review): <fstream.h> and <iostream.h> are the pre-standard stream
// headers. The class below declares an unqualified 'ofstream' member, so
// switching to <fstream>/<iostream> would also require qualifying that name.
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/framework/XMLRecognizer.hpp>
#include <xercesc/framework/XMLBuffer.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <fstream.h>
#include <iostream.h>

// Forward declarations: only pointers/references to these types are needed
// by the declarations in this header.
class InputSource;
class BinInputStream;
class ReaderMgr;
class XMLBuffer;
class XMLScanner;
class XMLTranscoder;


// Masks for the fgCharCharsTable array
//
// Each mask selects one bit of a table entry. The inline XMLReader::isXXX()
// predicates in this header index XMLReader::fgCharCharsTable by the
// character value and test the entry against one (or an OR) of these masks.
const XMLByte   gBaseCharMask               = 0x1;      // tested by isBaseChar(), isXMLLetter()
const XMLByte   gSpecialCharDataMask        = 0x2;      // tested by isSpecialCharDataChar()
const XMLByte   gNameCharMask               = 0x4;      // tested by isNameChar()
const XMLByte   gPlainContentCharMask       = 0x8;      // tested by isPlainContentChar()
const XMLByte   gSpecialStartTagCharMask    = 0x10;     // tested by isSpecialStartTagChar()
const XMLByte   gLetterCharMask             = 0x20;     // tested by isXMLLetter()
const XMLByte   gXMLCharMask                = 0x40;     // tested by isXMLChar()
const XMLByte   gWhitespaceCharMask         = 0x80;     // tested by isWhitespace()


// ---------------------------------------------------------------------------
//  Instances of this class are used to manage the content of entities. The
//  scanner maintains a stack of these, one for each entity (this means entity
//  in the sense of any parsed file or internal entity) currently being
//  scanned. This class, given a binary input stream will handle reading in
//  the data and decoding it from its external decoding into the internal
//  Unicode format.
//  Once internalized, this class provides the access
//  methods to read in the data in various ways, maintains line and column
//  information, and provides high performance character attribute checking
//  methods.
//
//  This is NOT to be derived from.
//
// ---------------------------------------------------------------------------
class XMLPARSER_EXPORT XMLReader
{
public:
    // -----------------------------------------------------------------------
    //  Public types
    //
    //  Types
    //      Kind of entity the reader represents (see fType below).
    //  Sources
    //      Whether the content was already internalized or is external
    //      (see fSource below).
    //  RefFrom
    //      Whether the entity was referenced from inside a literal
    //      (see fRefFrom below).
    // -----------------------------------------------------------------------
    enum Types
    {
        Type_PE
        , Type_General
    };

    enum Sources
    {
        Source_Internal
        , Source_External
    };

    enum RefFrom
    {
        RefFrom_Literal
        , RefFrom_NonLiteral
    };


    // -----------------------------------------------------------------------
    //  Public, static methods
    //
    //  isAllSpaces(), containsWhiteSpace(), isFirstNameChar() and
    //  isPublicIdChar() are defined out of line. The remaining isXXX()
    //  predicates are defined inline in this header as single-bit (or
    //  two-bit, for isXMLLetter) tests against fgCharCharsTable.
    // -----------------------------------------------------------------------
    static bool isAllSpaces
    (
        const   XMLCh* const    toCheck
        , const unsigned int    count
    );

    static bool containsWhiteSpace
    (
        const   XMLCh* const    toCheck
        , const unsigned int    count
    );


    static bool isBaseChar(const XMLCh toCheck);
    static bool isFirstNameChar(const XMLCh toCheck);
    static bool isNameChar(const XMLCh toCheck);
    static bool isPlainContentChar(const XMLCh toCheck);
    static bool isPublicIdChar(const XMLCh toCheck);
    static bool isSpecialCharDataChar(const XMLCh toCheck);
    static bool isSpecialStartTagChar(const XMLCh toCheck);
    static bool isXMLLetter(const XMLCh toCheck);
    static bool isXMLChar(const XMLCh toCheck);
    static bool isWhitespace(const XMLCh toCheck);

    // Returns the current value of the static fNEL flag (defined inline
    // further down in this header).
    static bool isNELRecognized();


    // -----------------------------------------------------------------------
    //  Constructors and Destructor
    //
    //  The two constructors differ only in the extra encodingStr parameter
    //  of the second one. Per the fForcedEncoding notes below, the forced
    //  encoding is selected "by calling an alternate constructor" --
    //  presumably the encodingStr overload; confirm in the .cpp.
    //
    //  throwAtEnd and urlCaching both default to false; diskCachePath
    //  defaults to a null pointer.
    // -----------------------------------------------------------------------
    XMLReader
    (
        const   XMLCh* const            pubId
        , const XMLCh* const            sysId
        ,       BinInputStream* const   streamToAdopt
        , const RefFrom                 from
        , const Types                   type
        , const Sources                 source
        , const bool                    throwAtEnd = false
        , const bool                    urlCaching = false
        , const XMLCh* const            diskCachePath = 0
    );

    XMLReader
    (
        const   XMLCh* const            pubId
        , const XMLCh* const            sysId
        ,       BinInputStream* const   streamToAdopt
        , const XMLCh* const            encodingStr
        , const RefFrom                 from
        , const Types                   type
        , const Sources                 source
        , const bool                    throwAtEnd = false
        , const bool                    urlCaching = false
        , const XMLCh* const            diskCachePath = 0
    );

    ~XMLReader();


    // -----------------------------------------------------------------------
    //  Character buffer management methods
    //
    //  charsLeftInBuffer() is defined inline below as
    //  fCharsAvail - fCharIndex; refreshCharBuffer() is out of line.
    // -----------------------------------------------------------------------
    unsigned long charsLeftInBuffer() const;
    bool refreshCharBuffer();


    // -----------------------------------------------------------------------
    //  Scanning methods
    //
    //  getNextCharIfNot() and movePlainContentChars() are defined inline
    //  in this header for speed; the others are out of line.
    // -----------------------------------------------------------------------
    bool getName(XMLBuffer& toFill, const bool token);
    bool getNextChar(XMLCh& chGotten);
    bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
    void movePlainContentChars(XMLBuffer &dest);
    bool getSpaces(XMLBuffer& toFill);
    bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
    bool peekNextChar(XMLCh& chGotten);
    bool skipIfQuote(XMLCh& chGotten);
    bool skipSpaces(bool& skippedSomething);
    bool skippedChar(const XMLCh toSkip);
    bool skippedSpace();
    bool skippedString(const XMLCh* const toSkip);
    bool peekString(const XMLCh* const toPeek);


    // -----------------------------------------------------------------------
    //  Getter methods
    //
    //  All of these except getSrcOffset() are defined inline in this header
    //  and simply return the corresponding data member.
    // -----------------------------------------------------------------------
    unsigned int getColumnNumber() const;
    const XMLCh* getEncodingStr() const;
    unsigned int getLineNumber() const;
    bool getNoMoreFlag() const;
    const XMLCh* getPublicId() const;
    unsigned int getReaderNum() const;
    RefFrom getRefFrom() const;
    Sources getSource() const;
    unsigned int getSrcOffset() const;
    const XMLCh* getSystemId() const;
    bool getThrowAtEnd() const;
    Types getType() const;
    bool getURLEntityCaching() const;


    // -----------------------------------------------------------------------
    //  Setter methods
    //
    //  setReaderNum() and setThrowAtEnd() are defined inline in this header;
    //  setEncoding() is out of line.
    // -----------------------------------------------------------------------
    bool setEncoding
    (
        const   XMLCh* const    newEncoding
    );
    void setReaderNum(const unsigned int newNum);
    void setThrowAtEnd(const bool newValue);


private:
    // ---------------------------------------------------------------------------
    //  Class Constants
    //
    //  kCharBufSize
    //      The size of the character spool buffer that we use. It's not terribly
    //      large because it's just getting filled with data from a raw byte
    //      buffer as we go along. We don't want to decode all the text at
    //      once before we find out that there is an error.
    //
    //      NOTE: This is a size in characters, not bytes.
    //
    //  kRawBufSize
    //      The size of the raw buffer from which raw bytes are spooled out
    //      as we transcode chunks of data. As it is emptied, it is filled back
    //      in again from the source stream.
    //
    //  Both buffers (and fCharSizeBuf) are embedded directly in the object
    //  as fixed-size arrays, so every XMLReader instance carries them.
    // ---------------------------------------------------------------------------
    enum Constants
    {
        kCharBufSize        = 16 * 1024
        , kRawBufSize       = 48 * 1024
    };


    // -----------------------------------------------------------------------
    //  Private static methods
    // -----------------------------------------------------------------------
    static bool checkTable
    (
        const   XMLCh* const    theTable
        , const XMLCh           toCheck
    );

    // Private; XMLPlatformUtils is declared a friend at the bottom of this
    // class, presumably so that it can call this -- confirm in the .cpp.
    static void enableNELWS();

    // -----------------------------------------------------------------------
    //  Private helper methods
    // -----------------------------------------------------------------------
    void checkForSwapped();

    void doInitCharSizeChecks();

    void doInitDecode();

    XMLByte getNextRawByte
    (
        const   bool            eoiOk
    );

    void refreshRawBuffer();

    void writeToFile();

    void setTranscoder
    (
        const   XMLCh* const    newEncoding
    );

    unsigned int xcodeMoreChars
    (
                XMLCh* const            bufToFill
        ,       unsigned char* const    charSizes
        , const unsigned int            maxChars
    );


    // -----------------------------------------------------------------------
    //  Data members
    //
    //  fCharIndex
    //      The index into the character buffer. When this hits fCharsAvail
    //      then it's time to refill.
    //
    //  fCharBuf
    //      A buffer that the reader manager fills up with transcoded
    //      characters a small amount at a time.
    //
    //  fCharsAvail
    //      The characters currently available in the character buffer.
    //
    //  fCharSizeBuf
    //      This buffer is an array that contains the number of source chars
    //      eaten to create each char in the fCharBuf buffer. So the entry
    //      fCharSizeBuf[x] is the number of source chars that were eaten
    //      to make the internalized char fCharBuf[x]. This only contains
    //      useful data if fSrcOfsSupported is true.
    //
    //  fCurCol
    //  fCurLine
    //      The current line and column that we are in within this reader's
    //      text.
    //
    //  fEncoding
    //      This is the rough encoding setting. This enum is set during
    //      construction and just tells us the rough family of encoding that
    //      we are doing.
    //
    //  fEncodingStr
    //      This is the name of the encoding we are using. It will be
    //      provisionally set during construction, from the auto-sensed
    //      encoding. But it might be overridden when the XMLDecl is finally
    //      seen by the scanner. It can also be forced to a particular
    //      encoding, in which case fForcedEncoding is set.
    //
    //  fForcedEncoding
    //      If the encoding is forced then this is set and all other
    //      information will be ignored. This encoding will be taken as
    //      gospel. This is done by calling an alternate constructor.
    //
    //  fNoMore
    //      This is set when the source text is exhausted. It lets us know
    //      quickly that no more text is available.
    //
    //  fRawBufIndex
    //      The current index into the raw byte buffer. When it's equal to
    //      fRawBytesAvail then we need to read another buffer.
    //
    //  fRawByteBuf
    //      This is the raw byte buffer that is used to spool out bytes
    //      from into the fCharBuf buffer, as we transcode in blocks.
    //
    //  fRawBytesAvail
    //      The number of bytes currently available in the raw buffer. This
    //      helps deal with the last buffer's worth, which will usually not
    //      be a full one.
    //
    //  fReaderNum
    //      Each reader from a particular reader manager (which means from a
    //      particular document) is given a unique number. The reader manager
    //      sets these numbers. They are used to catch things like partial
    //      markup errors.
    //
    //  fRefFrom
    //      This flag is provided in the ctor, and tells us if we represent
    //      some entity being expanded inside a literal. Sometimes things
    //      happen differently inside and outside literals.
    //
    //  fPublicId
    //  fSystemId
    //      These are the system and public ids of the source that this
    //      reader is reading.
    //
    //  fSentTrailingSpace
    //      If we are a PE entity being read and we are not referenced from a
    //      literal, then a leading and trailing space must be faked into the
    //      data. This lets us know we've done the trailing space already (so
    //      we don't just keep doing it again and again.)
    //
    //  fSource
    //      Indicates whether the content this reader is spooling has already
    //      been internalized. This will prevent multiple processing of
    //      whitespace when an already internalized entity is being spooled
    //      out.
    //
    //  fSpareCh
    //      Some encodings can create two chars in an atomic way, e.g.
    //      surrogate pairs. We might not be able to store both, so we store
    //      it here until the next buffer transcoding operation.
    //
    //  fSrcOfsBase
    //      This is the base offset within the source of this entity. Values
    //      in the current fCharSizeBuf array are relative to this value.
    //
    //  fSrcOfsSupported
    //      This flag is set to indicate whether source byte offset info
    //      is supported. For intrinsic encodings, it's always set since we
    //      can always support it. For transcoder based encodings, we ask
    //      the transcoder if it supports it or not.
    //
    //  fStream
    //      This is the input stream that provides the data for the reader.
    //      It's always treated as a raw byte stream. The derived class will
    //      ask for buffers of text from it and will handle making some
    //      sense of it.
    //
    //  fSwapped
    //      If the encoding is one of the ones we do intrinsically, and it's
    //      in a different byte order from our native order, then this is
    //      set to remind us to byte swap it during transcoding.
    //
    //  fThrowAtEnd
    //      Indicates whether the reader manager should throw an end of entity
    //      exception at the end of this reader instance. This is usually
    //      set for top level external entity references. It overrides the
    //      reader manager's global flag that controls throwing at the end
    //      of entities. Defaults to false.
    //
    //  fTranscoder
    //      If the encoding is not one that we handle intrinsically, then
    //      we use an external transcoder to do it. This class is an
    //      abstraction that allows us to use pluggable external transcoding
    //      services (via XMLTransService in util.)
    //
    //  fType
    //      Indicates whether this reader represents a PE or not. If this
    //      flag is true and the fInLiteral flag is false, then we will put
    //      out an extra space at the end.
    //      NOTE(review): there is no fInLiteral member in this class; this
    //      presumably refers to fRefFrom == RefFrom_NonLiteral -- confirm.
    //
    //  fURLEntityCaching
    //      Indicates whether the External entity files are to be cached to disk
    //      or not. The default is to cache it.
    //      NOTE(review): both constructors default their urlCaching argument
    //      to false, which contradicts "the default is to cache" if this
    //      member is initialized from that argument -- confirm in the .cpp.
    //
    //  fDiskCachePath
    //      NOTE(review): not described by the original author. Presumably
    //      a narrow-character copy of the diskCachePath constructor
    //      argument; ownership is not visible from this header.
    //
    //  fDiskCacheFile
    //      NOTE(review): not described by the original author. An output
    //      file stream, presumably the target of writeToFile() when
    //      fURLEntityCaching is set -- confirm in the .cpp.
    //
    // -----------------------------------------------------------------------
    unsigned int                fCharIndex;
    XMLCh                       fCharBuf[kCharBufSize];
    unsigned int                fCharsAvail;
    unsigned char               fCharSizeBuf[kCharBufSize];
    unsigned int                fCurCol;
    unsigned int                fCurLine;
    XMLRecognizer::Encodings    fEncoding;
    XMLCh*                      fEncodingStr;
    bool                        fForcedEncoding;
    bool                        fNoMore;
    XMLCh*                      fPublicId;
    unsigned int                fRawBufIndex;
    XMLByte                     fRawByteBuf[kRawBufSize];
    unsigned int                fRawBytesAvail;
    unsigned int                fReaderNum;
    RefFrom                     fRefFrom;
    bool                        fSentTrailingSpace;
    Sources                     fSource;
    XMLCh                       fSpareCh;
    unsigned int                fSrcOfsBase;
    bool                        fSrcOfsSupported;
    XMLCh*                      fSystemId;
    BinInputStream*             fStream;
    bool                        fSwapped;
    bool                        fThrowAtEnd;
    XMLTranscoder*              fTranscoder;
    Types                       fType;
    bool                        fURLEntityCaching;
    char*                       fDiskCachePath;
    ofstream                    fDiskCacheFile;


    // -----------------------------------------------------------------------
    //  Static data members
    //
    //  fgCharCharsTable
    //      The character characteristics table. Bits in each byte, represent
    //      the characteristics of each character. It is generated via some
    //      code and then hard coded into the cpp file for speed.
    //      It has 0x10000 entries and is indexed directly by the XMLCh
    //      value in the inline isXXX() predicates.
    //
    //  fNEL
    //      Flag to represent whether NEL whitespace recognition is enabled
    //      or disabled
    // -----------------------------------------------------------------------
    static XMLByte  fgCharCharsTable[0x10000];
    static bool     fNEL;

    friend class XMLPlatformUtils;
};


// ---------------------------------------------------------------------------
//  XMLReader: Public, static methods
//
//  Each predicate below looks up the table entry for toCheck and reports
//  whether the relevant mask bit(s) are set.
// ---------------------------------------------------------------------------
inline bool XMLReader::isBaseChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gBaseCharMask) != 0);
}

inline bool XMLReader::isNameChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gNameCharMask) != 0);
}

inline bool XMLReader::isPlainContentChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gPlainContentCharMask) != 0);
}


inline bool XMLReader::isSpecialCharDataChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gSpecialCharDataMask) != 0);
}

inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) != 0);
}

inline bool XMLReader::isXMLChar(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gXMLCharMask) != 0);
}

inline bool XMLReader::isXMLLetter(const XMLCh toCheck)
{
    // True when either the base-char bit or the letter bit is set.
    const XMLByte ourMask = gBaseCharMask | gLetterCharMask;
    return ((fgCharCharsTable[toCheck] & ourMask) != 0);
}

inline bool XMLReader::isWhitespace(const XMLCh toCheck)
{
    return ((fgCharCharsTable[toCheck] & gWhitespaceCharMask) != 0);
}

// ---------------------------------------------------------------------------
//  XMLReader: Buffer management methods
--------------------------------------------------------------------------- 00627 inline unsigned long XMLReader::charsLeftInBuffer() const 00628 { 00629 return fCharsAvail - fCharIndex; 00630 } 00631 00632 00633 // --------------------------------------------------------------------------- 00634 // XMLReader: Getter methods 00635 // --------------------------------------------------------------------------- 00636 inline unsigned int XMLReader::getColumnNumber() const 00637 { 00638 return fCurCol; 00639 } 00640 00641 inline const XMLCh* XMLReader::getEncodingStr() const 00642 { 00643 return fEncodingStr; 00644 } 00645 00646 inline unsigned int XMLReader::getLineNumber() const 00647 { 00648 return fCurLine; 00649 } 00650 00651 inline bool XMLReader::getNoMoreFlag() const 00652 { 00653 return fNoMore; 00654 } 00655 00656 inline const XMLCh* XMLReader::getPublicId() const 00657 { 00658 return fPublicId; 00659 } 00660 00661 inline unsigned int XMLReader::getReaderNum() const 00662 { 00663 return fReaderNum; 00664 } 00665 00666 inline XMLReader::RefFrom XMLReader::getRefFrom() const 00667 { 00668 return fRefFrom; 00669 } 00670 00671 inline XMLReader::Sources XMLReader::getSource() const 00672 { 00673 return fSource; 00674 } 00675 00676 inline const XMLCh* XMLReader::getSystemId() const 00677 { 00678 return fSystemId; 00679 } 00680 00681 inline bool XMLReader::getThrowAtEnd() const 00682 { 00683 return fThrowAtEnd; 00684 } 00685 00686 inline XMLReader::Types XMLReader::getType() const 00687 { 00688 return fType; 00689 } 00690 00691 inline bool XMLReader::isNELRecognized() { 00692 00693 return fNEL; 00694 } 00695 00696 inline bool XMLReader::getURLEntityCaching() const 00697 { 00698 return fURLEntityCaching; 00699 } 00700 00701 //inline const XMLCh* XMLReader::getdiskCachePath() const 00702 //{ 00703 // return fDiskCachePath; 00704 //} 00705 00706 // --------------------------------------------------------------------------- 00707 // XMLReader: Setter methods 00708 // 
--------------------------------------------------------------------------- 00709 inline void XMLReader::setReaderNum(const unsigned int newNum) 00710 { 00711 fReaderNum = newNum; 00712 } 00713 00714 inline void XMLReader::setThrowAtEnd(const bool newValue) 00715 { 00716 fThrowAtEnd = newValue; 00717 } 00718 00719 00720 00721 // --------------------------------------------------------------------------- 00722 // 00723 // XMLReader: movePlainContentChars() 00724 // 00725 // Move as many plain (no special handling of any sort required) content 00726 // characters as possible from this reader to the supplied destination buffer. 00727 // 00728 // This is THE hottest performance spot in the parser. 00729 // 00730 // --------------------------------------------------------------------------- 00731 inline void XMLReader::movePlainContentChars(XMLBuffer &dest) 00732 { 00733 int count = 0; 00734 XMLCh *pStart = &fCharBuf[fCharIndex]; 00735 XMLCh *pCurrent = pStart; 00736 XMLCh *pEnd = &fCharBuf[fCharsAvail]; 00737 00738 00739 while (pCurrent < pEnd) 00740 { 00741 if (! XMLReader::isPlainContentChar(*pCurrent++)) 00742 break; 00743 count++; 00744 } 00745 00746 if (count > 0) 00747 { 00748 fCharIndex += count; 00749 fCurCol += count; 00750 dest.append(pStart, count); 00751 } 00752 } 00753 00754 00755 00756 00757 // --------------------------------------------------------------------------- 00758 // XMLReader: getNextCharIfNot() method inlined for speed 00759 // --------------------------------------------------------------------------- 00760 inline bool XMLReader::getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten) 00761 { 00762 // 00763 // See if there is at least a char in the buffer. Else, do the buffer 00764 // reload logic. 
00765 // 00766 if (fCharIndex < fCharsAvail) 00767 { 00768 // Check the next char 00769 if (fCharBuf[fCharIndex] == chNotToGet) 00770 return false; 00771 00772 // Its not the one we want to skip so bump the index 00773 chGotten = fCharBuf[fCharIndex++]; 00774 } 00775 else 00776 { 00777 // If fNoMore is set, then we have nothing else to give 00778 if (fNoMore) 00779 return false; 00780 00781 // If the buffer is empty, then try to refresh 00782 if (fCharIndex == fCharsAvail) 00783 { 00784 if (!refreshCharBuffer()) 00785 { 00786 // If still empty, then return false 00787 if (fCharIndex == fCharsAvail) 00788 return false; 00789 } 00790 } 00791 00792 // Check the next char 00793 if (fCharBuf[fCharIndex] == chNotToGet) 00794 return false; 00795 00796 // Its not the one we want to skip so bump the index 00797 chGotten = fCharBuf[fCharIndex++]; 00798 } 00799 00800 // Handle end of line normalization and line/col member maintenance. 00801 if (chGotten == chCR) 00802 { 00803 // 00804 // Do the normalization. We return chLF regardless of which was 00805 // found. We also eat a chCR followed by an chLF. 00806 // 00807 // We only do this if the content being spooled is not already 00808 // internalized. 00809 // 00810 if (fSource == Source_External) 00811 { 00812 // 00813 // See if we have another char left. If not, don't bother. 00814 // Else, see if its an chLF to eat. If it is, bump the 00815 // index again. 
00816 // 00817 if (fCharIndex < fCharsAvail) 00818 { 00819 if (fCharBuf[fCharIndex] == chLF 00820 || ((fCharBuf[fCharIndex] == chNEL) && fNEL)) 00821 fCharIndex++; 00822 } 00823 else 00824 { 00825 if (refreshCharBuffer()) 00826 { 00827 if (fCharBuf[fCharIndex] == chLF 00828 || ((fCharBuf[fCharIndex] == chNEL) && fNEL)) 00829 fCharIndex++; 00830 } 00831 } 00832 00833 // And return just an chLF 00834 chGotten = chLF; 00835 } 00836 00837 // And handle the line/col stuff 00838 fCurCol = 1; 00839 fCurLine++; 00840 } 00841 else if (chGotten == chLF 00842 || ((chGotten == chNEL) && fNEL)) 00843 { 00844 chGotten = chLF; 00845 fCurLine++; 00846 fCurCol = 1; 00847 } 00848 else if (chGotten) 00849 { 00850 // 00851 // Only do this is not a null char. Null chars are not part of the 00852 // real content. They are just marker characters inserted into 00853 // the stream. 00854 // 00855 fCurCol++; 00856 } 00857 return true; 00858 } 00859 00860 #endif