|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
  org.cyberneko.html.HTMLScanner
A simple HTML scanner. This scanner makes no attempt to balance tags or fix other problems in the source document — it just scans what it can and generates XNI document "events", ignoring errors of all kinds.
This component recognizes the following features:
This component recognizes the following properties:
See Also: HTMLElements, HTMLEntities
Nested Class Summary | |
class |
HTMLScanner.ContentScanner
The primary HTML document scanner. |
static class |
HTMLScanner.CurrentEntity
Current entity. |
protected static class |
HTMLScanner.LocationItem
Location infoset item. |
static class |
HTMLScanner.PlaybackInputStream
A playback input stream. |
static interface |
HTMLScanner.Scanner
Basic scanner interface. |
class |
HTMLScanner.SpecialScanner
Special scanner used for elements whose content needs to be scanned as plain text, ignoring markup such as elements and entity references. |
Field Summary | |
protected static String |
AUGMENTATIONS
Include infoset augmentations. |
static String |
CDATA_SECTIONS
Scan CDATA sections. |
protected static boolean |
DEBUG_CALLBACKS
Set to true to debug callbacks. |
protected static int |
DEFAULT_BUFFER_SIZE
Default buffer size. |
protected static String |
DEFAULT_ENCODING
Default encoding. |
protected static String |
DOCTYPE_PUBID
Doctype declaration public identifier. |
protected static String |
DOCTYPE_SYSID
Doctype declaration system identifier. |
protected static String |
ERROR_REPORTER
Error reporter. |
protected boolean |
fAugmentations
Augmentations. |
protected int |
fBeginColumnNumber
Beginning column number. |
protected int |
fBeginLineNumber
Beginning line number. |
protected HTMLScanner.PlaybackInputStream |
fByteStream
The playback byte stream. |
protected boolean |
fCDATASections
CDATA sections. |
protected HTMLScanner.Scanner |
fContentScanner
Content scanner. |
protected HTMLScanner.CurrentEntity |
fCurrentEntity
Current entity. |
protected Stack |
fCurrentEntityStack
The current entity stack. |
protected String |
fDefaultIANAEncoding
Default encoding. |
protected String |
fDoctypePubid
Doctype declaration public identifier. |
protected String |
fDoctypeSysid
Doctype declaration system identifier. |
protected XMLDocumentHandler |
fDocumentHandler
The document handler. |
protected int |
fElementCount
Element count. |
protected int |
fElementDepth
Element depth. |
protected int |
fEndColumnNumber
Ending column number. |
protected int |
fEndLineNumber
Ending line number. |
protected HTMLErrorReporter |
fErrorReporter
Error reporter. |
protected String |
fIANAEncoding
Auto-detected IANA encoding. |
protected boolean |
fIgnoreSpecifiedCharset
Ignore specified character set. |
protected boolean |
fInsertDoctype
Insert document type declaration. |
protected String |
fJavaEncoding
Auto-detected Java encoding. |
protected short |
fNamesAttrs
Modify HTML attribute names. |
protected short |
fNamesElems
Modify HTML element names. |
protected boolean |
fNotifyCharRefs
Notify character entity references. |
protected boolean |
fNotifyHtmlBuiltinRefs
Notify HTML built-in general entity references. |
protected boolean |
fNotifyXmlBuiltinRefs
Notify XML built-in general entity references. |
protected boolean |
fOverrideDoctype
Override doctype declaration public and system identifiers. |
protected boolean |
fReportErrors
Report errors. |
protected HTMLScanner.Scanner |
fScanner
The current scanner. |
protected short |
fScannerState
The current scanner state. |
protected boolean |
fScriptStripCommentDelims
Strip comment delimiters from SCRIPT tags. |
protected HTMLScanner.SpecialScanner |
fSpecialScanner
Special scanner used for elements whose content needs to be scanned as plain text, ignoring markup such as elements and entity references. |
protected XMLString |
fString
String. |
protected XMLStringBuffer |
fStringBuffer
String buffer. |
protected boolean |
fStyleStripCommentDelims
Strip comment delimiters from STYLE tags. |
static String |
HTML_4_01_FRAMESET_PUBID
HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). |
static String |
HTML_4_01_FRAMESET_SYSID
HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd"). |
static String |
HTML_4_01_STRICT_PUBID
HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). |
static String |
HTML_4_01_STRICT_SYSID
HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). |
static String |
HTML_4_01_TRANSITIONAL_PUBID
HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN"). |
static String |
HTML_4_01_TRANSITIONAL_SYSID
HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd"). |
static String |
IGNORE_SPECIFIED_CHARSET
Ignore specified charset found in the <meta http-equiv='Content-Type' content='text/html;charset=…'> tag. |
static String |
INSERT_DOCTYPE
Insert document type declaration. |
protected static String |
NAMES_ATTRS
Modify HTML attribute names: { "upper", "lower", "default" }. |
protected static String |
NAMES_ELEMS
Modify HTML element names: { "upper", "lower", "default" }. |
protected static short |
NAMES_LOWERCASE
Lowercase HTML names. |
protected static short |
NAMES_NO_CHANGE
Don't modify HTML names. |
protected static short |
NAMES_UPPERCASE
Uppercase HTML names. |
static String |
NOTIFY_CHAR_REFS
Notify character entity references (e.g. &#32;, &#x20;, etc.). |
static String |
NOTIFY_HTML_BUILTIN_REFS
Notify handler of built-in entity references (e.g. &nbsp;, &copy;, etc.). |
static String |
NOTIFY_XML_BUILTIN_REFS
Notify handler of built-in entity references (e.g. &amp;, &lt;, etc.). |
static String |
OVERRIDE_DOCTYPE
Override doctype declaration public and system identifiers. |
protected static String |
REPORT_ERRORS
Report errors. |
static String |
SCRIPT_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from SCRIPT tag contents. |
protected static short |
STATE_CONTENT
State: content. |
protected static short |
STATE_END_DOCUMENT
State: end document. |
protected static short |
STATE_MARKUP_BRACKET
State: markup bracket. |
protected static short |
STATE_START_DOCUMENT
State: start document. |
static String |
STYLE_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from STYLE tag contents. |
protected static HTMLEventInfo |
SYNTHESIZED_ITEM
Synthesized event info item. |
Constructor Summary | |
HTMLScanner()
|
Method Summary | |
protected static boolean |
builtinXmlRef(String name)
Returns true if the name is a built-in XML general entity reference. |
void |
cleanup(boolean closeall)
Cleans up used resources. |
static String |
expandSystemId(String systemId,
String baseSystemId)
Expands a system id and returns the system id as a URI, if it can be expanded. |
protected static String |
fixURI(String str)
Fixes a platform dependent filename to standard URI form. |
String |
getBaseSystemId()
Returns the base system identifier. |
int |
getColumnNumber()
Returns the current column number. |
XMLDocumentHandler |
getDocumentHandler()
Returns the document handler. |
String |
getEncoding()
Returns the encoding. |
String |
getExpandedSystemId()
Returns the expanded system identifier. |
Boolean |
getFeatureDefault(String featureId)
Returns the default state for a feature. |
int |
getLineNumber()
Returns the current line number. |
String |
getLiteralSystemId()
Returns the literal system identifier. |
protected static short |
getNamesValue(String value)
Converts HTML names string value to constant value. |
Object |
getPropertyDefault(String propertyId)
Returns the default state for a property. |
String |
getPublicId()
Returns the public identifier. |
String[] |
getRecognizedFeatures()
Returns recognized features. |
String[] |
getRecognizedProperties()
Returns recognized properties. |
protected static String |
getValue(XMLAttributes attrs,
String aname)
Returns the value of the specified attribute, ignoring case. |
protected int |
load(int offset)
Loads a new chunk of data into the buffer and returns the number of characters loaded or -1 if no additional characters were loaded. |
protected Augmentations |
locationAugs()
Returns an augmentations object with a location item added. |
protected static String |
modifyName(String name,
short mode)
Modifies the given name based on the specified mode. |
void |
pushInputSource(XMLInputSource inputSource)
Pushes an input source onto the current entity stack. |
protected int |
read()
Reads a single character. |
void |
reset(XMLComponentManager manager)
Resets the component. |
protected XMLResourceIdentifier |
resourceId()
Returns an empty resource identifier. |
protected void |
scanDoctype()
Scans a DOCTYPE line. |
boolean |
scanDocument(boolean complete)
Scans the document. |
protected int |
scanEntityRef(XMLStringBuffer str,
boolean content)
Scans an entity reference. |
protected String |
scanLiteral()
Scans a quoted literal. |
protected String |
scanName()
Scans a name. |
void |
setDocumentHandler(XMLDocumentHandler handler)
Sets the document handler. |
void |
setFeature(String featureId,
boolean state)
Sets a feature. |
void |
setInputSource(XMLInputSource source)
Sets the input source. |
void |
setProperty(String propertyId,
Object value)
Sets a property. |
protected void |
setScanner(HTMLScanner.Scanner scanner)
Sets the scanner. |
protected void |
setScannerState(short state)
Sets the scanner state. |
protected boolean |
skip(String s,
boolean caseSensitive)
Returns true if the specified text is present and is skipped. |
protected boolean |
skipMarkup(boolean balance)
Skips markup. |
protected int |
skipNewlines()
Skips newlines and returns the number of newlines skipped. |
protected int |
skipNewlines(int maxlines)
Skips newlines and returns the number of newlines skipped. |
protected boolean |
skipSpaces()
Skips whitespace. |
protected Augmentations |
synthesizedAugs()
Returns an augmentations object with a synthesized item added. |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
public static final String HTML_4_01_STRICT_PUBID
public static final String HTML_4_01_STRICT_SYSID
public static final String HTML_4_01_TRANSITIONAL_PUBID
public static final String HTML_4_01_TRANSITIONAL_SYSID
public static final String HTML_4_01_FRAMESET_PUBID
public static final String HTML_4_01_FRAMESET_SYSID
protected static final String AUGMENTATIONS
protected static final String REPORT_ERRORS
public static final String NOTIFY_CHAR_REFS
public static final String NOTIFY_XML_BUILTIN_REFS
Note: This only applies to the five pre-defined XML general entities. Specifically, "amp", "lt", "gt", "quot", and "apos". This is done for compatibility with the Xerces feature.
To be notified of the built-in entity references in HTML, set the http://cyberneko.org/html/features/scanner/notify-builtin-refs feature to true.
public static final String NOTIFY_HTML_BUILTIN_REFS
Note: This includes the five pre-defined XML general entities.
public static final String SCRIPT_STRIP_COMMENT_DELIMS
public static final String STYLE_STRIP_COMMENT_DELIMS
public static final String IGNORE_SPECIFIED_CHARSET
public static final String CDATA_SECTIONS
public static final String OVERRIDE_DOCTYPE
public static final String INSERT_DOCTYPE
protected static final String NAMES_ELEMS
protected static final String NAMES_ATTRS
protected static final String DEFAULT_ENCODING
protected static final String ERROR_REPORTER
protected static final String DOCTYPE_PUBID
protected static final String DOCTYPE_SYSID
protected static final short STATE_CONTENT
protected static final short STATE_MARKUP_BRACKET
protected static final short STATE_START_DOCUMENT
protected static final short STATE_END_DOCUMENT
protected static final short NAMES_NO_CHANGE
protected static final short NAMES_UPPERCASE
protected static final short NAMES_LOWERCASE
protected static final int DEFAULT_BUFFER_SIZE
protected static final boolean DEBUG_CALLBACKS
protected static final HTMLEventInfo SYNTHESIZED_ITEM
protected boolean fAugmentations
protected boolean fReportErrors
protected boolean fNotifyCharRefs
protected boolean fNotifyXmlBuiltinRefs
protected boolean fNotifyHtmlBuiltinRefs
protected boolean fScriptStripCommentDelims
protected boolean fStyleStripCommentDelims
protected boolean fIgnoreSpecifiedCharset
protected boolean fCDATASections
protected boolean fOverrideDoctype
protected boolean fInsertDoctype
protected short fNamesElems
protected short fNamesAttrs
protected String fDefaultIANAEncoding
protected HTMLErrorReporter fErrorReporter
protected String fDoctypePubid
protected String fDoctypeSysid
protected int fBeginLineNumber
protected int fBeginColumnNumber
protected int fEndLineNumber
protected int fEndColumnNumber
protected HTMLScanner.PlaybackInputStream fByteStream
protected HTMLScanner.CurrentEntity fCurrentEntity
protected final Stack fCurrentEntityStack
protected HTMLScanner.Scanner fScanner
protected short fScannerState
protected XMLDocumentHandler fDocumentHandler
protected String fIANAEncoding
protected String fJavaEncoding
protected int fElementCount
protected int fElementDepth
protected HTMLScanner.Scanner fContentScanner
protected HTMLScanner.SpecialScanner fSpecialScanner
protected final XMLString fString
protected final XMLStringBuffer fStringBuffer
Constructor Detail |
public HTMLScanner()
Method Detail |
public void pushInputSource(XMLInputSource inputSource)
Note: This functionality is experimental at this time and is subject to change in future releases of NekoHTML.
inputSource
- The new input source to start scanning.
public void cleanup(boolean closeall)
closeall
- Close all streams, including the original. This is used in cases when the application has opened the original document stream and should be responsible for closing it.
public String getEncoding()
getEncoding
in interface XMLLocator
public String getPublicId()
getPublicId
in interface XMLLocator
public String getBaseSystemId()
getBaseSystemId
in interface XMLLocator
public String getLiteralSystemId()
getLiteralSystemId
in interface XMLLocator
public String getExpandedSystemId()
getExpandedSystemId
in interface XMLLocator
public int getLineNumber()
getLineNumber
in interface XMLLocator
public int getColumnNumber()
getColumnNumber
in interface XMLLocator
public Boolean getFeatureDefault(String featureId)
getFeatureDefault
in interface HTMLComponent
public Object getPropertyDefault(String propertyId)
getPropertyDefault
in interface HTMLComponent
public String[] getRecognizedFeatures()
getRecognizedFeatures
in interface XMLComponent
public String[] getRecognizedProperties()
getRecognizedProperties
in interface XMLComponent
public void reset(XMLComponentManager manager) throws XMLConfigurationException
reset
in interface XMLComponent
XMLConfigurationException
public void setFeature(String featureId, boolean state) throws XMLConfigurationException
setFeature
in interface XMLComponent
XMLConfigurationException
public void setProperty(String propertyId, Object value) throws XMLConfigurationException
setProperty
in interface XMLComponent
XMLConfigurationException
public void setInputSource(XMLInputSource source) throws IOException
setInputSource
in interface XMLDocumentScanner
IOException
public boolean scanDocument(boolean complete) throws XNIException, IOException
scanDocument
in interface XMLDocumentScanner
XNIException
IOException
public void setDocumentHandler(XMLDocumentHandler handler)
setDocumentHandler
in interface XMLDocumentSource
public XMLDocumentHandler getDocumentHandler()
getDocumentHandler
in interface XMLDocumentSource
protected static String getValue(XMLAttributes attrs, String aname)
public static String expandSystemId(String systemId, String baseSystemId)
systemId
- The systemId to be expanded.
protected static String fixURI(String str)
str
- The string to fix.
protected static final String modifyName(String name, short mode)
protected static final short getNamesValue(String value)
See Also: NAMES_NO_CHANGE, NAMES_LOWERCASE, NAMES_UPPERCASE
protected int read() throws IOException
IOException
protected int load(int offset) throws IOException
offset
- The offset at which new characters should be loaded.
IOException
protected void setScanner(HTMLScanner.Scanner scanner)
protected void setScannerState(short state)
protected void scanDoctype() throws IOException
IOException
protected String scanLiteral() throws IOException
IOException
protected String scanName() throws IOException
IOException
protected int scanEntityRef(XMLStringBuffer str, boolean content) throws IOException
IOException
protected boolean skip(String s, boolean caseSensitive) throws IOException
IOException
protected boolean skipMarkup(boolean balance) throws IOException
IOException
protected boolean skipSpaces() throws IOException
IOException
protected int skipNewlines() throws IOException
IOException
protected int skipNewlines(int maxlines) throws IOException
IOException
protected final Augmentations locationAugs()
protected final Augmentations synthesizedAugs()
protected final XMLResourceIdentifier resourceId()
protected static boolean builtinXmlRef(String name)
|
|||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |