Main Page   Class Hierarchy   Compound List   File List   Compound Members  

XMLReader.hpp

00001 /*
00002  * The Apache Software License, Version 1.1
00003  *
00004  * Copyright (c) 1999-2001 The Apache Software Foundation.  All rights
00005  * reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or without
00008  * modification, are permitted provided that the following conditions
00009  * are met:
00010  *
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  *
00014  * 2. Redistributions in binary form must reproduce the above copyright
00015  *    notice, this list of conditions and the following disclaimer in
00016  *    the documentation and/or other materials provided with the
00017  *    distribution.
00018  *
00019  * 3. The end-user documentation included with the redistribution,
00020  *    if any, must include the following acknowledgment:
00021  *       "This product includes software developed by the
00022  *        Apache Software Foundation (http://www.apache.org/)."
00023  *    Alternately, this acknowledgment may appear in the software itself,
00024  *    if and wherever such third-party acknowledgments normally appear.
00025  *
00026  * 4. The names "Xerces" and "Apache Software Foundation" must
00027  *    not be used to endorse or promote products derived from this
00028  *    software without prior written permission. For written
00029  *    permission, please contact apache\@apache.org.
00030  *
00031  * 5. Products derived from this software may not be called "Apache",
00032  *    nor may "Apache" appear in their name, without prior written
00033  *    permission of the Apache Software Foundation.
00034  *
00035  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
00036  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00037  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00038  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
00039  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00040  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00041  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
00042  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00043  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
00044  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
00045  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00046  * SUCH DAMAGE.
00047  * ====================================================================
00048  *
00049  * This software consists of voluntary contributions made by many
00050  * individuals on behalf of the Apache Software Foundation, and was
00051  * originally based on software copyright (c) 1999, International
00052  * Business Machines, Inc., http://www.ibm.com .  For more information
00053  * on the Apache Software Foundation, please see
00054  * <http://www.apache.org/>.
00055  */
00056 
00057 /*
00058  * $Log: XMLReader.hpp,v $
00059  * Revision 1.1  2002/05/11 20:19:15  bhavani
00060  * CR#CR062582# adding xercesc 1.7 file
00061  *
00062  * Revision 1.1.1.1  2002/02/01 22:22:02  peiyongz
00063  * sane_include
00064  *
00065  * Revision 1.18  2001/12/06 17:47:04  tng
00066  * Performance Enhancement.  Modify the handling of the fNEL option so that it results in fgCharCharsTable being modified, instead of having all of the low-level routines check the option.  This seemed acceptable because the code appears to only permit the option to be turned on and not turned off again.   By Henry Zongaro.
00067  *
00068  * Revision 1.17  2001/07/12 18:50:13  tng
00069  * Some performance modification regarding standalone check and xml decl check.
00070  *
00071  * Revision 1.16  2001/05/11 13:26:17  tng
00072  * Copyright update.
00073  *
00074  * Revision 1.15  2001/05/03 18:42:51  knoaman
00075  * Added new option to the parsers so that the NEL (0x85) char can be treated as a newline character.
00076  *
00077  * Revision 1.14  2001/01/25 19:16:58  tng
00078  * const should be used instead of static const.  Fixed by Khaled Noaman.
00079  *
00080  * Revision 1.13  2000/07/25 22:33:05  aruna1
00081  * Char definitions in XMLUni moved to XMLUniDefs
00082  *
00083  * Revision 1.12  2000/07/08 00:17:13  andyh
00084  * Cleanup of yesterday's speedup changes.  Merged new bit into the
00085  * scanner character properties table.
00086  *
00087  * Revision 1.11  2000/07/07 01:08:44  andyh
00088  * Parser speed up in scan of XML content.
00089  *
00090  * Revision 1.10  2000/07/06 21:00:52  jpolast
00091  * inlined getNextCharIfNot() for better performance
00092  *
00093  * Revision 1.9  2000/05/11 23:11:33  andyh
00094  * Add missing validity checks for stand-alone documents, character range
00095  * and Well-formed parsed entities.  Changes contributed by Sean MacRoibeaird
00096  * <sean.Macroibeaird@ireland.sun.com>
00097  *
00098  * Revision 1.8  2000/03/02 19:54:29  roddey
00099  * This checkin includes many changes done while waiting for the
00100  * 1.1.0 code to be finished. I can't list them all here, but a list is
00101  * available elsewhere.
00102  *
00103  * Revision 1.7  2000/02/24 20:18:07  abagchi
00104  * Swat for removing Log from API docs
00105  *
00106  * Revision 1.6  2000/02/06 07:47:53  rahulj
00107  * Year 2K copyright swat.
00108  *
00109  * Revision 1.5  2000/01/25 01:04:21  roddey
00110  * Fixes a bogus error about ]]> in char data.
00111  *
00112  * Revision 1.4  2000/01/22 00:01:08  roddey
00113  * Simple change to get rid of two hard coded 'x' type characters, which won't
00114  * work on EBCDIC systems.
00115  *
00116  * Revision 1.3  1999/12/18 00:20:00  roddey
00117  * More changes to support the new, completely orthagonal, support for
00118  * intrinsic encodings.
00119  *
00120  * Revision 1.2  1999/12/15 19:48:03  roddey
00121  * Changed to use new split of transcoder interfaces into XML transcoders and
00122  * LCP transcoders, and implementation of intrinsic transcoders as pluggable
00123  * transcoders, and addition of Latin1 intrinsic support.
00124  *
00125  * Revision 1.1.1.1  1999/11/09 01:08:22  twl
00126  * Initial checkin
00127  *
00128  * Revision 1.3  1999/11/08 20:44:47  rahul
00129  * Swat for adding in Product name and CVS comment log variable.
00130  *
00131  */
00132 
00133 #if !defined(XMLREADER_HPP)
00134 #define XMLREADER_HPP
00135 
00136 #include <xercesc/util/XMLUniDefs.hpp>
00137 #include <xercesc/framework/XMLRecognizer.hpp>
00138 #include <xercesc/framework/XMLBuffer.hpp>
00139 #include <xercesc/util/XMLUniDefs.hpp>
00140 #include <fstream.h>
00141 #include <iostream.h>
00142 
// Forward declarations. Within this header BinInputStream and XMLTranscoder
// are only used through pointers; XMLBuffer is also used by value of its
// members in the inline movePlainContentChars() (dest.append), so it relies
// on the XMLBuffer.hpp include above for its full definition.
00143 class InputSource;
00144 class BinInputStream;
00145 class ReaderMgr;
00146 class XMLBuffer;
00147 class XMLScanner;
00148 class XMLTranscoder;
00149 
00150 
00151 // Masks for the fgCharCharsTable array
//
// Each mask selects exactly one bit of a table entry, so the eight masks
// together use all eight bits of an XMLByte. The inline XMLReader::is*()
// predicates further down test a table entry against one (or, for
// isXMLLetter, two) of these masks.
00152 const XMLByte   gBaseCharMask               = 0x1;
00153 const XMLByte   gSpecialCharDataMask        = 0x2;
00154 const XMLByte   gNameCharMask               = 0x4;
00155 const XMLByte   gPlainContentCharMask       = 0x8;
00156 const XMLByte   gSpecialStartTagCharMask    = 0x10;
00157 const XMLByte   gLetterCharMask             = 0x20;
00158 const XMLByte   gXMLCharMask                = 0x40;
00159 const XMLByte   gWhitespaceCharMask         = 0x80;
00160 
00161 
00162 // ---------------------------------------------------------------------------
00163 //  Instances of this class are used to manage the content of entities. The
00164 //  scanner maintains a stack of these, one for each entity (this means entity
00165 //  in the sense of any parsed file or internal entity) currently being
00166 //  scanned. This class, given a binary input stream, will handle reading in
00167 //  the data and decoding it from its external encoding into the internal
00168 //  Unicode format. Once internalized, this class provides the access
00169 //  methods to read in the data in various ways, maintains line and column
00170 //  information, and provides high performance character attribute checking
00171 //  methods.
00172 //
00173 //  This is NOT to be derived from.
00174 //
00175 // ---------------------------------------------------------------------------
00176 class XMLPARSER_EXPORT XMLReader
00177 {
00178 public:
00179     // -----------------------------------------------------------------------
00180     //  Public types
00181     // -----------------------------------------------------------------------
          // Whether the reader represents a parameter entity (PE) or a general
          // one; stored in fType.
00182     enum Types
00183     {
00184         Type_PE
00185         , Type_General
00186     };
00187 
          // Whether the content being spooled is already internalized
          // (Source_Internal) or not (Source_External); stored in fSource and
          // consulted by getNextCharIfNot() before normalizing line ends.
00188     enum Sources
00189     {
00190         Source_Internal
00191         , Source_External
00192     };
00193 
          // Whether the entity is being expanded inside a literal or not;
          // stored in fRefFrom.
00194     enum RefFrom
00195     {
00196         RefFrom_Literal
00197         , RefFrom_NonLiteral
00198     };
00199 
00200 
00201     // -----------------------------------------------------------------------
00202     //  Public, static methods
00203     // -----------------------------------------------------------------------
00204     static bool isAllSpaces
00205     (
00206         const   XMLCh* const    toCheck
00207         , const unsigned int    count
00208     );
00209 
00210     static bool containsWhiteSpace
00211     (
00212         const   XMLCh* const    toCheck
00213         , const unsigned int    count
00214     );
00215 
00216 
          // Character class predicates. Those defined inline in this header
          // are single lookups in fgCharCharsTable; isFirstNameChar() and
          // isPublicIdChar() are defined elsewhere.
00217     static bool isBaseChar(const XMLCh toCheck);
00218     static bool isFirstNameChar(const XMLCh toCheck);
00219     static bool isNameChar(const XMLCh toCheck);
00220     static bool isPlainContentChar(const XMLCh toCheck);
00221     static bool isPublicIdChar(const XMLCh toCheck);
00222     static bool isSpecialCharDataChar(const XMLCh toCheck);
00223     static bool isSpecialStartTagChar(const XMLCh toCheck);
00224     static bool isXMLLetter(const XMLCh toCheck);
00225     static bool isXMLChar(const XMLCh toCheck);
00226     static bool isWhitespace(const XMLCh toCheck);
00227 
          // Returns the current value of the static fNEL flag.
00231     static bool isNELRecognized();
00232 
00233 
00234     // -----------------------------------------------------------------------
00235     //  Constructors and Destructor
00236     // -----------------------------------------------------------------------
          // NOTE(review): the parameter name streamToAdopt suggests the reader
          // takes ownership of the stream - confirm against the implementation.
00237     XMLReader
00238     (
00239         const   XMLCh* const                pubId
00240         , const XMLCh* const                sysId
00241         ,       BinInputStream* const       streamToAdopt
00242         , const RefFrom                     from
00243         , const Types                       type
00244         , const Sources                     source
00245         , const bool                        throwAtEnd = false
00246         , const bool                        urlCaching = false
00247         , const XMLCh* const                diskCachePath = 0
00248     );
00249 
          // Same as above plus an explicit encodingStr. NOTE(review): this is
          // presumably the "alternate constructor" that sets fForcedEncoding
          // (see the data member notes below) - confirm.
00250     XMLReader
00251     (
00252         const   XMLCh* const                pubId
00253         , const XMLCh* const                sysId
00254         ,       BinInputStream* const       streamToAdopt
00255         , const XMLCh* const                encodingStr
00256         , const RefFrom                     from
00257         , const Types                       type
00258         , const Sources                     source
00259         , const bool                        throwAtEnd = false
00260         , const bool                        urlCaching = false
00261         , const XMLCh* const                diskCachePath = 0
00262     );
00263 
00264     ~XMLReader();
00265 
00266 
00267     // -----------------------------------------------------------------------
00268     //  Character buffer management methods
00269     // -----------------------------------------------------------------------
00270     unsigned long charsLeftInBuffer() const;
00271     bool refreshCharBuffer();
00272 
00273 
00274     // -----------------------------------------------------------------------
00275     //  Scanning methods
00276     // -----------------------------------------------------------------------
00277     bool getName(XMLBuffer& toFill, const bool token);
00278     bool getNextChar(XMLCh& chGotten);
00279     bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
00280     void movePlainContentChars(XMLBuffer &dest);
00281     bool getSpaces(XMLBuffer& toFill);
00282     bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
00283     bool peekNextChar(XMLCh& chGotten);
00284     bool skipIfQuote(XMLCh& chGotten);
00285     bool skipSpaces(bool& skippedSomething);
00286     bool skippedChar(const XMLCh toSkip);
00287     bool skippedSpace();
00288     bool skippedString(const XMLCh* const toSkip);
00289     bool peekString(const XMLCh* const toPeek);
00290 
00291 
00292     // -----------------------------------------------------------------------
00293     //  Getter methods
00294     // -----------------------------------------------------------------------
00295     unsigned int getColumnNumber() const;
00296     const XMLCh* getEncodingStr() const;
00297     unsigned int getLineNumber() const;
00298     bool getNoMoreFlag() const;
00299     const XMLCh* getPublicId() const;
00300     unsigned int getReaderNum() const;
00301     RefFrom getRefFrom() const;
00302     Sources getSource() const;
00303     unsigned int getSrcOffset() const;
00304     const XMLCh* getSystemId() const;
00305     bool getThrowAtEnd() const;
00306     Types getType() const;
00307     bool getURLEntityCaching() const;
00308 
00309 
00310     // -----------------------------------------------------------------------
00311     //  Setter methods
00312     // -----------------------------------------------------------------------
00313     bool setEncoding
00314     (
00315         const   XMLCh* const    newEncoding
00316     );
00317     void setReaderNum(const unsigned int newNum);
00318     void setThrowAtEnd(const bool newValue);
00319 
00320 
00321 private:
00322     // ---------------------------------------------------------------------------
00323     //  Class Constants
00324     //
00325     //  kCharBufSize
00326     //      The size of the character spool buffer that we use. It's not terribly
00327     //      large because it's just getting filled with data from a raw byte
00328     //      buffer as we go along. We don't want to decode all the text at
00329     //      once before we find out that there is an error.
00330     //
00331     //      NOTE: This is a size in characters, not bytes.
00332     //
00333     //  kRawBufSize
00334     //      The size of the raw buffer from which raw bytes are spooled out
00335     //      as we transcode chunks of data. As it is emptied, it is filled back
00336     //      in again from the source stream.
00337     // ---------------------------------------------------------------------------
00338     enum Constants
00339     {
00340         kCharBufSize        = 16 * 1024
00341         , kRawBufSize       = 48 * 1024
00342     };
00343 
00344 
00345     // -----------------------------------------------------------------------
00346     //  Private static methods
00347     // -----------------------------------------------------------------------
00348     static bool checkTable
00349     (
00350         const   XMLCh* const    theTable
00351         , const XMLCh           toCheck
00352     );
00353 
00354 
          // NOTE(review): presumably switches on NEL whitespace recognition
          // (fNEL / fgCharCharsTable). Being private and static, it is only
          // reachable from this class and the friend XMLPlatformUtils -
          // confirm the exact effect in the implementation file.
00358     static void enableNELWS();
00359 
00360     // -----------------------------------------------------------------------
00361     //  Private helper methods
00362     // -----------------------------------------------------------------------
00363     void checkForSwapped();
00364 
00365     void doInitCharSizeChecks();
00366 
00367     void doInitDecode();
00368 
00369     XMLByte getNextRawByte
00370     (
00371         const   bool            eoiOk
00372     );
00373 
00374     void refreshRawBuffer();
00375 
          // NOTE(review): presumably writes to fDiskCacheFile when URL entity
          // caching is enabled - confirm in the implementation file.
00376     void writeToFile();
00377 
00378     void setTranscoder
00379     (
00380         const   XMLCh* const    newEncoding
00381     );
00382 
00383     unsigned int xcodeMoreChars
00384     (
00385                 XMLCh* const            bufToFill
00386         ,       unsigned char* const    charSizes
00387         , const unsigned int            maxChars
00388     );
00389 
00390 
00391     // -----------------------------------------------------------------------
00392     //  Data members
00393     //
00394     //  fCharIndex
00395     //      The index into the character buffer. When this hits fCharsAvail
00396     //      then it's time to refill.
00397     //
00398     //  fCharBuf
00399     //      A buffer that the reader manager fills up with transcoded
00400     //      characters a small amount at a time.
00401     //
00402     //  fCharsAvail
00403     //      The characters currently available in the character buffer.
00404     //
00405     //  fCharSizeBuf
00406     //      This buffer is an array that contains the number of source chars
00407     //      eaten to create each char in the fCharBuf buffer. So the entry
00408     //      fCharSizeBuf[x] is the number of source chars that were eaten
00409     //      to make the internalized char fCharBuf[x]. This only contains
00410     //      useful data if fSrcOfsSupported is true.
00411     //
00412     //  fCurCol
00413     //  fCurLine
00414     //      The current line and column that we are in within this reader's
00415     //      text.
00416     //
00417     //  fEncoding
00418     //      This is the rough encoding setting. This enum is set during
00419     //      construction and just tells us the rough family of encoding that
00420     //      we are doing.
00421     //
00422     //  fEncodingStr
00423     //      This is the name of the encoding we are using. It will be
00424     //      provisionally set during construction, from the auto-sensed
00425     //      encoding. But it might be overridden when the XMLDecl is finally
00426     //      seen by the scanner. It can also be forced to a particular
00427     //      encoding, in which case fForcedEncoding is set.
00428     //
00429     //  fForcedEncoding
00430     //      If the encoding is forced then this is set and all other
00431     //      information will be ignored. This encoding will be taken as
00432     //      gospel. This is done by calling an alternate constructor.
00433     //
00434     //  fNoMore
00435     //      This is set when the source text is exhausted. It lets us know
00436     //      quickly that no more text is available.
00437     //
00438     //  fRawBufIndex
00439     //      The current index into the raw byte buffer. When it's equal to
00440     //      fRawBytesAvail then we need to read another buffer.
00441     //
00442     //  fRawByteBuf
00443     //      This is the raw byte buffer from which bytes are spooled out
00444     //      into the fCharBuf buffer, as we transcode in blocks.
00445     //
00446     //  fRawBytesAvail
00447     //      The number of bytes currently available in the raw buffer. This
00448     //      helps deal with the last buffer's worth, which will usually not
00449     //      be a full one.
00450     //
00451     //  fReaderNum
00452     //      Each reader from a particular reader manager (which means from a
00453     //      particular document) is given a unique number. The reader manager
00454     //      sets these numbers. They are used to catch things like partial
00455     //      markup errors.
00456     //
00457     //  fRefFrom
00458     //      This flag is provided in the ctor, and tells us if we represent
00459     //      some entity being expanded inside a literal. Sometimes things
00460     //      happen differently inside and outside literals.
00461     //
00462     //  fPublicId
00463     //  fSystemId
00464     //      These are the system and public ids of the source that this
00465     //      reader is reading.
00466     //
00467     //  fSentTrailingSpace
00468     //      If we are a PE entity being read and we are not referenced from a
00469     //      literal, then a leading and trailing space must be faked into the
00470     //      data. This lets us know we've done the trailing space already (so
00471     //      we don't just keep doing it again and again.)
00472     //
00473     //  fSource
00474     //      Indicates whether the content this reader is spooling has already
00475     //      been internalized. This will prevent multiple processing of
00476     //      whitespace when an already internalized entity is being spooled
00477     //      out.
00478     //
00479     //  fSpareCh
00480     //      Some encodings can create two chars in an atomic way, e.g.
00481     //      surrogate pairs. We might not be able to store both, so we store
00482     //      it here until the next buffer transcoding operation.
00483     //
00484     //  fSrcOfsBase
00485     //      This is the base offset within the source of this entity. Values
00486     //      in the current fCharSizeBuf array are relative to this value.
00487     //
00488     //  fSrcOfsSupported
00489     //      This flag is set to indicate whether source byte offset info
00490     //      is supported. For intrinsic encodings, it's always set since we
00491     //      can always support it. For transcoder based encodings, we ask
00492     //      the transcoder if it supports it or not.
00493     //
00494     //  fStream
00495     //      This is the input stream that provides the data for the reader.
00496     //      It's always treated as a raw byte stream. The derived class will
00497     //      ask for buffers of text from it and will handle making some
00498     //      sense of it.
00499     //
00500     //  fSwapped
00501     //      If the encoding is one of the ones we do intrinsically, and it's
00502     //      in a different byte order from our native order, then this is
00503     //      set to remind us to byte swap it during transcoding.
00504     //
00505     //  fThrowAtEnd
00506     //      Indicates whether the reader manager should throw an end of entity
00507     //      exception at the end of this reader instance. This is usually
00508     //      set for top level external entity references. It overrides the
00509     //      reader manager's global flag that controls throwing at the end
00510     //      of entities. Defaults to false.
00511     //
00512     //  fTranscoder
00513     //      If the encoding is not one that we handle intrinsically, then
00514     //      we use an external transcoder to do it. This class is an
00515     //      abstraction that allows us to use pluggable external transcoding
00516     //      services (via XMLTransService in util.)
00517     //
00518     //  fType
00519     //      Indicates whether this reader represents a PE or not. If it is
00520     //      a PE and it was not referenced from a literal (see fRefFrom),
00521     //      then we will put out an extra space at the end.
00522     //
00523     //  fURLEntityCaching
00524     //      Indicates whether the External entity files are to be cached to disk
00525     //      or not. Both constructors default their urlCaching argument to false.
00526     //
          //  fDiskCachePath
          //  fDiskCacheFile
          //      NOTE(review): not described by the original author. Presumably
          //      the directory used for the on-disk entity cache (cf. the
          //      diskCachePath ctor argument) and the output stream that
          //      writeToFile() writes to - confirm in the implementation file.
          //
00527     // -----------------------------------------------------------------------
00528     unsigned int                fCharIndex;
00529     XMLCh                       fCharBuf[kCharBufSize];
00530     unsigned int                fCharsAvail;
00531     unsigned char               fCharSizeBuf[kCharBufSize];
00532     unsigned int                fCurCol;
00533     unsigned int                fCurLine;
00534     XMLRecognizer::Encodings    fEncoding;
00535     XMLCh*                      fEncodingStr;
00536     bool                        fForcedEncoding;
00537     bool                        fNoMore;
00538     XMLCh*                      fPublicId;
00539     unsigned int                fRawBufIndex;
00540     XMLByte                     fRawByteBuf[kRawBufSize];
00541     unsigned int                fRawBytesAvail;
00542     unsigned int                fReaderNum;
00543     RefFrom                     fRefFrom;
00544     bool                        fSentTrailingSpace;
00545     Sources                     fSource;
00546     XMLCh                       fSpareCh;
00547     unsigned int                fSrcOfsBase;
00548     bool                        fSrcOfsSupported;
00549     XMLCh*                      fSystemId;
00550     BinInputStream*             fStream;
00551     bool                        fSwapped;
00552     bool                        fThrowAtEnd;
00553     XMLTranscoder*              fTranscoder;
00554     Types                       fType;
00555     bool                        fURLEntityCaching;
00556     char*                       fDiskCachePath;
00557     ofstream                    fDiskCacheFile;
00558 
00559 
00560     // -----------------------------------------------------------------------
00561     //  Static data members
00562     //
00563     //  fgCharCharsTable
00564     //      The character characteristics table. Bits in each byte represent
00565     //      the characteristics of each character. It is generated via some
00566     //      code and then hard coded into the cpp file for speed. It has
00567     //      0x10000 entries and is indexed directly by an XMLCh value.
00568     //  fNEL
00569     //      Flag that represents whether NEL whitespace recognition is enabled
00570     //      or disabled
00571     // -----------------------------------------------------------------------
00572     static XMLByte  fgCharCharsTable[0x10000];
00573     static bool     fNEL;
00574 
00575     friend class XMLPlatformUtils;
00576 };
00577 
00578 
00579 // ---------------------------------------------------------------------------
00580 //  XMLReader: Public, static methods
00581 // ---------------------------------------------------------------------------
// True when the fgCharCharsTable entry for toCheck has the gBaseCharMask bit.
00582 inline bool XMLReader::isBaseChar(const XMLCh toCheck)
00583 {
00584     return (fgCharCharsTable[toCheck] & gBaseCharMask) ? true : false;
00585 }
00586 
// True when the fgCharCharsTable entry for toCheck has the gNameCharMask bit.
00587 inline bool XMLReader::isNameChar(const XMLCh toCheck)
00588 {
00589     return (fgCharCharsTable[toCheck] & gNameCharMask) ? true : false;
00590 }
00591 
// True when the fgCharCharsTable entry for toCheck has the
// gPlainContentCharMask bit; used by movePlainContentChars().
00592 inline bool XMLReader::isPlainContentChar(const XMLCh toCheck)
00593 {
00594     return (fgCharCharsTable[toCheck] & gPlainContentCharMask) ? true : false;
00595 }
00596 
00597 
// True when the fgCharCharsTable entry for toCheck has the
// gSpecialCharDataMask bit.
00598 inline bool XMLReader::isSpecialCharDataChar(const XMLCh toCheck)
00599 {
00600     return (fgCharCharsTable[toCheck] & gSpecialCharDataMask) ? true : false;
00601 }
00602 
// True when the fgCharCharsTable entry for toCheck has the
// gSpecialStartTagCharMask bit.
00603 inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck)
00604 {
00605     return (fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) ? true : false;
00606 }
00607 
// True when the fgCharCharsTable entry for toCheck has the gXMLCharMask bit.
00608 inline bool XMLReader::isXMLChar(const XMLCh toCheck)
00609 {
00610     return (fgCharCharsTable[toCheck] & gXMLCharMask) ? true : false;
00611 }
00612 
// True when the fgCharCharsTable entry for toCheck has either the
// gBaseCharMask bit or the gLetterCharMask bit (or both).
00613 inline bool XMLReader::isXMLLetter(const XMLCh toCheck)
00614 {
00615     return (fgCharCharsTable[toCheck] & (gBaseCharMask | gLetterCharMask)) ? true : false;
00616 }
00618 
// True when the fgCharCharsTable entry for toCheck has the
// gWhitespaceCharMask bit.
00619 inline bool XMLReader::isWhitespace(const XMLCh toCheck)
00620 {
00621     return (fgCharCharsTable[toCheck] & gWhitespaceCharMask) ? true : false;
00622 }
00623 
00624 // ---------------------------------------------------------------------------
00625 //  XMLReader: Buffer management methods
00626 // ---------------------------------------------------------------------------
// Number of characters between the read index and the end of the valid part
// of the character buffer (fCharsAvail - fCharIndex, computed as unsigned int).
00627 inline unsigned long XMLReader::charsLeftInBuffer() const
00628 {
00629     return static_cast<unsigned long>(fCharsAvail - fCharIndex);
00630 }
00631 
00632 
00633 // ---------------------------------------------------------------------------
00634 //  XMLReader: Getter methods
00635 // ---------------------------------------------------------------------------
// Returns the column this reader is currently at (fCurCol).
00636 inline unsigned int XMLReader::getColumnNumber() const
00637 {
00638     return this->fCurCol;
00639 }
00640 
// Returns the stored encoding name (fEncodingStr); the pointer is not copied.
00641 inline const XMLCh* XMLReader::getEncodingStr() const
00642 {
00643     return this->fEncodingStr;
00644 }
00645 
// Returns the line this reader is currently at (fCurLine).
00646 inline unsigned int XMLReader::getLineNumber() const
00647 {
00648     return this->fCurLine;
00649 }
00650 
// Returns fNoMore, the flag that marks the source text as exhausted.
00651 inline bool XMLReader::getNoMoreFlag() const
00652 {
00653     return this->fNoMore;
00654 }
00655 
// Returns the stored public id (fPublicId); the pointer is not copied.
00656 inline const XMLCh* XMLReader::getPublicId() const
00657 {
00658     return this->fPublicId;
00659 }
00660 
// Returns the number last assigned through setReaderNum() (fReaderNum).
00661 inline unsigned int XMLReader::getReaderNum() const
00662 {
00663     return this->fReaderNum;
00664 }
00665 
// Returns fRefFrom: whether this entity is expanded inside a literal or not.
00666 inline XMLReader::RefFrom XMLReader::getRefFrom() const
00667 {
00668     return this->fRefFrom;
00669 }
00670 
// Returns fSource: whether the spooled content is internal or external.
00671 inline XMLReader::Sources XMLReader::getSource() const
00672 {
00673     return this->fSource;
00674 }
00675 
// Returns the stored system id (fSystemId); the pointer is not copied.
00676 inline const XMLCh* XMLReader::getSystemId() const
00677 {
00678     return this->fSystemId;
00679 }
00680 
// Returns fThrowAtEnd, as last set by the constructor or setThrowAtEnd().
00681 inline bool XMLReader::getThrowAtEnd() const
00682 {
00683     return this->fThrowAtEnd;
00684 }
00685 
// Returns fType: whether this reader represents a PE or a general entity.
00686 inline XMLReader::Types XMLReader::getType() const
00687 {
00688     return this->fType;
00689 }
00690 
// Returns the class-wide fNEL flag (NEL whitespace recognition on/off).
00691 inline bool XMLReader::isNELRecognized()
00692 {
00693     return XMLReader::fNEL;
00694 }
00695 
// Returns fURLEntityCaching, the disk-caching flag for external entities.
00696 inline bool XMLReader::getURLEntityCaching() const
00697 {
00698     return this->fURLEntityCaching;
00699 }
00700 
00701 //inline const XMLCh* XMLReader::getdiskCachePath() const
00702 //{
00703 //    return fDiskCachePath;
00704 //}
00705 
00706 // ---------------------------------------------------------------------------
00707 //  XMLReader: Setter methods
00708 // ---------------------------------------------------------------------------
// Stores newNum as this reader's number (fReaderNum).
00709 inline void XMLReader::setReaderNum(const unsigned int newNum)
00710 {
00711     this->fReaderNum = newNum;
00712 }
00713 
// Stores newValue as this reader's throw-at-end flag (fThrowAtEnd).
00714 inline void XMLReader::setThrowAtEnd(const bool newValue)
00715 {
00716     this->fThrowAtEnd = newValue;
00717 }
00718 
00719 
00720 
00721 // ---------------------------------------------------------------------------
00722 //
00723 //  XMLReader: movePlainContentChars()
00724 //
00725 //       Appends the leading run of plain content chars (those needing no
00726 //       special handling) at the read position to dest and steps past them.
00727 //
00728 //       This is THE hottest performance spot in the parser.
00729 //
00730 // ---------------------------------------------------------------------------
00731 inline void XMLReader::movePlainContentChars(XMLBuffer &dest)
00732 {
00733     XMLCh* const runStart = fCharBuf + fCharIndex;
00734     XMLCh* const bufEnd   = fCharBuf + fCharsAvail;
00735 
00736     // Walk forward until the buffer ends or a non-plain char is met. The
00737     // character buffer is never refilled from here.
00738     XMLCh* scan = runStart;
00739     while ((scan < bufEnd) && XMLReader::isPlainContentChar(*scan))
00740         ++scan;
00741 
00742     // Nothing to move if the very first char needs special handling
00743     const int runLen = int(scan - runStart);
00744     if (runLen <= 0)
00745         return;
00746 
00747     fCharIndex += runLen;
00748     fCurCol    += runLen;
00749     dest.append(runStart, runLen);
00750 }
00753 
00754 
00755 
00756 
00757 // ---------------------------------------------------------------------------
00758 //  XMLReader: getNextCharIfNot() method inlined for speed
//
//  Fetches the next character into chGotten and consumes it, unless that
//  character equals chNotToGet.
//
//  Returns false, leaving chGotten untouched and consuming nothing, when:
//    - the next character equals chNotToGet, or
//    - the buffer is empty and fNoMore is set, or
//    - the buffer is empty and still empty after refreshCharBuffer() failed.
//  Otherwise returns true.
//
//  Line end handling on the character returned:
//    - chCR from a Source_External reader is reported as chLF, and a directly
//      following chLF (or chNEL when fNEL is set) is consumed as well.
//    - chCR from a Source_Internal reader is returned unchanged.
//    - chLF, and chNEL when fNEL is set, are reported as chLF.
//    In all of these cases fCurLine is incremented and fCurCol reset to 1.
//    Any other non-null character only increments fCurCol; a null character
//    leaves the line/column members alone.
00759 // ---------------------------------------------------------------------------
00760 inline bool XMLReader::getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten)
00761 {
00762     //
00763     //  See if there is at least a char in the buffer. Else, do the buffer
00764     //  reload logic.
00765     //
00766     if (fCharIndex < fCharsAvail)
00767     {
00768         // Check the next char
00769         if (fCharBuf[fCharIndex] == chNotToGet)
00770             return false;
00771 
00772         // It's not the char to leave in place, so take it and bump the index
00773         chGotten = fCharBuf[fCharIndex++];
00774     }
00775      else
00776     {
00777         // If fNoMore is set, then we have nothing else to give
00778         if (fNoMore)
00779             return false;
00780 
00781         // If the buffer is empty, then try to refresh
          // NOTE(review): this test can only be false if fCharIndex exceeds
          // fCharsAvail, which presumably never happens - confirm.
00782         if (fCharIndex == fCharsAvail)
00783         {
00784             if (!refreshCharBuffer())
00785             {
00786                 // If still empty, then return false
00787                 if (fCharIndex == fCharsAvail)
00788                     return false;
00789             }
00790         }
00791 
00792         // Check the next char
00793         if (fCharBuf[fCharIndex] == chNotToGet)
00794             return false;
00795 
00796         // It's not the char to leave in place, so take it and bump the index
00797         chGotten = fCharBuf[fCharIndex++];
00798     }
00799 
00800     // Handle end of line normalization and line/col member maintenance.
00801     if (chGotten == chCR)
00802     {
00803         //
00804         //  Do the normalization. We return chLF regardless of which was
00805         //  found. We also eat a chCR followed by a chLF.
00806         //
00807         //  We only do this if the content being spooled is not already
00808         //  internalized.
00809         //
00810         if (fSource == Source_External)
00811         {
00812             //
00813             //  See if we have another char left. If not, try to refill the
00814             //  buffer first. Then see if it's a chLF (or a recognized chNEL)
00815             //  to eat. If it is, bump the index again.
00816             //
00817             if (fCharIndex < fCharsAvail)
00818             {
00819                 if (fCharBuf[fCharIndex] == chLF
00820                     || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
00821                     fCharIndex++;
00822             }
00823              else
00824             {
                      // NOTE(review): a true result is taken to mean that
                      // fCharBuf[fCharIndex] is now valid - confirm against
                      // refreshCharBuffer().
00825                 if (refreshCharBuffer())
00826                 {
00827                     if (fCharBuf[fCharIndex] == chLF
00828                         || ((fCharBuf[fCharIndex] == chNEL) && fNEL))
00829                         fCharIndex++;
00830                 }
00831             }
00832 
00833             // And return just a chLF
00834             chGotten = chLF;
00835         }
00836 
00837         // And handle the line/col stuff (done for internal sources too)
00838         fCurCol = 1;
00839         fCurLine++;
00840     }
00841      else if (chGotten == chLF
00842               || ((chGotten == chNEL) && fNEL))
00843     {
              // A lone line feed, or a NEL when NEL recognition is on, is
              // always reported as chLF, whatever the source type.
00844         chGotten = chLF;
00845         fCurLine++;
00846         fCurCol = 1;
00847     }
00848      else if (chGotten)
00849     {
00850         //
00851         //  Only do this if it's not a null char. Null chars are not part of
00852         //  the real content. They are just marker characters inserted into
00853         //  the stream.
00854         //
00855         fCurCol++;
00856     }
00857     return true;
00858 }
00859 
00860 #endif

Generated on Tue Nov 19 09:36:35 2002 by doxygen1.3-rc1