用 JavaScript 在 Node.js 环境下读取超大的 Stack Overflow XML 文件
本文旨在为 Node.js 环境下读取超大 XML 文件提供一种解决方案。
编程语言: JavaScript
平台: Node.js
测试文件大小: 73G
程序包:
"node-xml-stream": "^1.0.2"
代码示例:
/**
 * Stream-parse a very large XML file with node-xml-stream and log every
 * opening tag together with its attributes.
 *
 * The file is opened with fs.createReadStream and piped into the parser, so
 * it is consumed incrementally rather than loaded with a single read.
 *
 * @param {string} [filePath] - Path of the XML file to read. Defaults to the
 *   Stack Overflow Posts dump location used by the original example.
 * @returns {void}
 */
const loadXml = (
  filePath = "D:/data/stackoverflow/stackoverflow.com-Posts/PostsCopy.xml"
) => {
  try {
    const Parser = require("node-xml-stream");
    const fs = require("fs");
    const parser = new Parser();

    // <tag attr="hello">  ->  name = 'tag', attrs = { attr: 'hello' }
    // NOTE(review): the sample output published with this example shows the
    // last attribute value ending in ' /' (e.g. CommentCount: '1 /') and
    // attribute values that contain markup being split into extra keys
    // (href, rel) - presumably parser limitations; verify attrs before
    // relying on them.
    parser.on("opentag", (name, attrs) => {
      console.log(name, attrs);
    });

    // </tag>  ->  name = 'tag'
    parser.on("closetag", (name) => {
      // Intentionally empty: extension point for end-of-element handling.
    });

    // <tag>TEXT</tag>  ->  text = 'TEXT'
    parser.on("text", (text) => {
      // Intentionally empty: extension point for text nodes.
    });

    // <![CDATA[data]]>  ->  cdata = 'data'
    parser.on("cdata", (cdata) => {
      // Intentionally empty: extension point for CDATA sections.
    });

    // <?xml version="1.0"?>  ->  name = 'xml', attrs = { version: '1.0' }
    parser.on("instruction", (name, attrs) => {
      // Intentionally empty: extension point for processing instructions.
    });

    // Only stream-errors are emitted. Report them instead of dropping them.
    parser.on("error", (err) => {
      console.error("XML parse error:", err);
    });

    parser.on("finish", () => {
      // Stream is completed.
    });

    // No sample markup is written to the parser here: pushing a demo
    // document ahead of the piped file would make the parser see two root
    // elements and log a spurious tag.
    const stream = fs.createReadStream(filePath);

    // Read failures (missing file, permissions, ...) are emitted
    // asynchronously, so the surrounding try/catch cannot see them; without
    // this listener they would crash the process as an unhandled 'error'.
    stream.on("error", (err) => {
      console.error(`Failed to read ${filePath}:`, err);
    });

    stream.pipe(parser);
  } catch (e) {
    // Synchronous failures only, e.g. the parser package cannot be loaded.
    console.error(e);
  }
};
运行结果片段:
row {
Id: '451',
PostTypeId: '2',
ParentId: '371',
CreationDate: '2008-08-02T13:45:57.197',
Score: '13',
Body: '<p>Yahoo uses a method called Sender ID, which can be configured at <a',
href: '"http://old.openspf.org/wizard.html?mydomain',
rel: '"nofollow noreferrer">The SPF Setup Wizard</a> and entered in to your DNS. Also one of the important ones for Exchange, Hotmail, AOL, Yahoo, and others is to have a Reverse DNS for your domain. Those will knock out most of the issues.
However you can never prevent a person intentionally blocking your or custom rules.</p>
',
OwnerUserId: '17',
LastEditorUserId: '246246',
LastEditDate: '2017-04-20T16:17:40.470',
LastActivityDate: '2017-04-20T16:17:40.470',
CommentCount: '1 /'
}
row {
Id: '467',
PostTypeId: '2',
ParentId: '17',
CreationDate: '2008-08-02T14:57:13.043',
Score: '22',
Body: '<p>While you havent said what youre storing, and you may have a great reason for doing so, often the answer is as a filesystem reference and the actual data is on the filesystem somewhere.</p>

<p><a',
href: '"http://www.onlamp.com/pub/a/onlamp/2002/07/11/MySQLtips.html"',
rel: '"noreferrer">http://www.onlamp.com/pub/a/onlamp/2002/07/11/MySQLtips.html</a></p>
',
OwnerUserId: '144',
LastActivityDate: '2008-08-02T14:57:13.043',
CommentCount: '0 /'
}
- 点赞
- 收藏
- 关注作者
评论(0)