192 lines
6 KiB
JavaScript
192 lines
6 KiB
JavaScript
|
import { Transform } from 'node:stream';
|
||
|
import { DevNullStream } from './dev-null-stream.js';
|
||
|
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';
|
||
|
/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform)
 * (which means you can pipe _through_ it, see example). Input chunks are passed
 * downstream unchanged; parsing results are reported through the `text`,
 * `startTag`, `endTag`, `doctype` and `comment` events.
 *
 * @example
 *
 * ```js
 * import { SAXParser } from 'parse5-sax-parser';
 * import http from 'node:http';
 * import fs from 'node:fs';
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const parser = new SAXParser();
 *
 * parser.on('text', text => {
 *     // Handle page text content
 *     // ...
 * });
 *
 * http.get('http://google.com', res => {
 *     // `SAXParser` is the `Transform` stream, which means you can pipe
 *     // through it. So, you can analyze the page content and, e.g., save it
 *     // to the file at the same time:
 *     res.pipe(parser).pipe(file);
 * });
 * ```
 */
export class SAXParser extends Transform {
    /**
     * @param options Parsing options. `sourceCodeLocationInfo` defaults to
     *   `false`; the merged options object is handed to `ParserFeedbackSimulator`.
     */
    constructor(options = {}) {
        super({ encoding: 'utf8', decodeStrings: false });

        // Text accumulated from consecutive character tokens, not yet emitted.
        this.pendingText = null;
        // Set by `_final()`; forwarded to the tokenizer as the "last chunk" flag.
        this.lastChunkWritten = false;
        // Once true, chunks are passed through without being tokenized.
        this.stopped = false;

        this.options = {
            sourceCodeLocationInfo: false,
            ...options,
        };

        // This instance is passed as the second argument so that the simulator
        // can report tokens back through the `on*` methods below
        // (NOTE(review): inferred from the call shape — confirm in parser-feedback-simulator.js).
        this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this);
        this.tokenizer = this.parserFeedbackSimulator.tokenizer;

        // NOTE: always pipe the stream to the /dev/null stream to avoid
        // the `highWaterMark` to be hit even if we don't have consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    //`Transform` implementation

    /**
     * Tokenizes a chunk and passes it downstream unchanged.
     *
     * @param chunk Input chunk; must be a string.
     * @param _encoding Unused.
     * @param callback Invoked with `(null, chunk)`.
     * @throws {TypeError} Synchronously, when `chunk` is not a string.
     */
    _transform(chunk, _encoding, callback) {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        callback(null, this._transformChunk(chunk));
    }

    /**
     * Marks the input as complete and feeds the tokenizer an empty last chunk.
     *
     * @param callback Invoked without arguments once the tokenizer has been fed.
     */
    _final(callback) {
        this.lastChunkWritten = true;
        // The empty chunk only carries the "last chunk" flag to the tokenizer;
        // there is no data to hand to the callback.
        this._transformChunk('');
        callback();
    }

    /**
     * Stops parsing. Useful if you want the parser to stop consuming CPU time
     * once you've obtained the desired info from the input stream. Doesn't
     * prevent piping, so that data will flow through the parser as usual.
     *
     * @example
     *
     * ```js
     * import { SAXParser } from 'parse5-sax-parser';
     * import http from 'node:http';
     * import fs from 'node:fs';
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *     // Process doctype info and stop parsing
     *     // ...
     *     parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *     // Despite the fact that parser.stop() was called whole
     *     // content of the page will be written to the file
     *     res.pipe(parser).pipe(file);
     * });
     * ```
     */
    stop() {
        this.stopped = true;
        this.tokenizer.pause();
    }

    //Internals

    /**
     * Feeds `chunk` to the tokenizer unless parsing has been stopped.
     *
     * @param chunk String chunk to tokenize.
     * @returns The same `chunk`, so it can be passed on downstream.
     */
    _transformChunk(chunk) {
        if (!this.stopped) {
            this.tokenizer.write(chunk, this.lastChunkWritten);
        }

        return chunk;
    }

    /**
     * Accumulates character data so that adjacent character tokens are
     * reported as a single `text` event.
     *
     * @internal
     */
    onCharacter({ chars, location }) {
        if (this.pendingText === null) {
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            this.pendingText.text += chars;

            if (location && this.pendingText.sourceCodeLocation) {
                // Keep the start of the first token, extend the end to this one.
                const { endLine, endCol, endOffset } = location;

                this.pendingText.sourceCodeLocation = {
                    ...this.pendingText.sourceCodeLocation,
                    endLine,
                    endCol,
                    endOffset,
                };
            }
        }

        // Flush right away when the preprocessor reports that it will drop the
        // parsed chunk (presumably so the text is emitted before the buffered
        // source is discarded — confirm against the preprocessor).
        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /** @internal */
    onWhitespaceCharacter(token) {
        this.onCharacter(token);
    }

    /** @internal */
    onNullCharacter(token) {
        this.onCharacter(token);
    }

    /**
     * Flushes any pending text and marks the parser as stopped.
     *
     * @internal
     */
    onEof() {
        this._emitPendingText();
        this.stopped = true;
    }

    /**
     * Flushes pending text, then reports a `startTag` event.
     *
     * @internal
     */
    onStartTag(token) {
        this._emitPendingText();

        const startTag = {
            tagName: token.tagName,
            attrs: token.attrs,
            selfClosing: token.selfClosing,
            sourceCodeLocation: token.location,
        };

        this.emitIfListenerExists('startTag', startTag);
    }

    /**
     * Flushes pending text, then reports an `endTag` event.
     *
     * @internal
     */
    onEndTag(token) {
        this._emitPendingText();

        const endTag = {
            tagName: token.tagName,
            sourceCodeLocation: token.location,
        };

        this.emitIfListenerExists('endTag', endTag);
    }

    /**
     * Flushes pending text, then reports a `doctype` event.
     *
     * @internal
     */
    onDoctype(token) {
        this._emitPendingText();

        const doctype = {
            name: token.name,
            publicId: token.publicId,
            systemId: token.systemId,
            sourceCodeLocation: token.location,
        };

        this.emitIfListenerExists('doctype', doctype);
    }

    /**
     * Flushes pending text, then reports a `comment` event.
     *
     * @internal
     */
    onComment(token) {
        this._emitPendingText();

        const comment = {
            text: token.data,
            sourceCodeLocation: token.location,
        };

        this.emitIfListenerExists('comment', comment);
    }

    /**
     * Emits `token` under `eventName` only when at least one listener is
     * registered for that event.
     *
     * @param eventName Name of the event to emit.
     * @param token Payload passed to the listeners.
     * @returns `true` if the token was emitted, `false` if there was no listener.
     */
    emitIfListenerExists(eventName, token) {
        if (this.listenerCount(eventName) === 0) {
            return false;
        }

        this._emitToken(eventName, token);

        return true;
    }

    /**
     * Emits `token` under `eventName` unconditionally.
     */
    _emitToken(eventName, token) {
        this.emit(eventName, token);
    }

    /**
     * Reports the accumulated text (if any) as a `text` event and clears it.
     */
    _emitPendingText() {
        if (this.pendingText !== null) {
            this.emitIfListenerExists('text', this.pendingText);
            this.pendingText = null;
        }
    }
}
|
||
|
//# sourceMappingURL=index.js.map
|