Overview
The new JavaScript API v2 is designed to simplify work by abandoning generators in favor of async/await.
Additionally, some limitations of the previous version have been eliminated, TypeScript support has been added, and performance has been improved. We recommend using this JavaScript API for creating all new scrapers.
To use JavaScript API v2, it is sufficient to inherit your scraper class from the base BaseParser class.
Let's take a look at the structure of the scraper class using an example:
- TypeScript
- JavaScript
import { BaseParser } from 'a-parser-types';
/**
 * Example scraper for JavaScript API v2 (TypeScript flavour).
 *
 * Requests the URL given in the query and extracts the page <title>, the
 * first <h1>, and up to `limitH2Tags` <h2> headers from the response body.
 */
export class JS_v2_example extends BaseParser {
    /**
     * Default configuration of the scraper.
     *
     * `results` declares two flat fields (title, h1) and one array (h2) whose
     * items have a single `header` field; `results_format` is the output
     * template that references them. `limitH2Tags` is a custom option read in
     * parse().
     * NOTE(review): `max_size` is presumably the maximum response size in
     * bytes (2 MiB) and `parsecodes` the accepted HTTP status codes — confirm
     * against the BaseParser documentation.
     */
    static defaultConf: typeof BaseParser.defaultConf = {
        version: '0.0.1',
        results: {
            flat: [
                ['title', 'Title'],
                ['h1', 'H1 Header'],
            ],
            arrays: {
                h2: ['H2 Headers List', [
                    ['header', 'Header'],
                ]],
            },
        },
        max_size: 2 * 1024 * 1024,
        parsecodes: {
            200: 1,
        },
        results_format: "Title: $title\nH1: $h1\nH2 headers:\n$h2.format('$header\\n')\n",
        limitH2Tags: 3,
    };

    /** Options exposed for editing: `limitH2Tags` as a text field. */
    static editableConf: typeof BaseParser.editableConf = [
        ['limitH2Tags', ['textfield', 'Limit H2 tags']],
    ];

    /**
     * Fetches `set.query` with a GET request and fills `results`.
     *
     * @param set - current task item; only `set.query` is read (used as the URL).
     * @param results - results object to fill; `results.h2` is only populated
     *   when it is present on the object.
     * @returns the same `results` object, unchanged if the request failed or
     *   the response body is not a string.
     */
    async parse(set, results) {
        const { success, data } = await this.request('GET', set.query);
        if (!success || typeof data !== 'string') {
            return results;
        }

        // HTML tag names are case-insensitive and tag content may span several
        // lines, hence the `i` and `s` (dotAll) flags.
        const titleMatch = data.match(/<title[^>]*>(.*?)<\/title>/is);
        if (titleMatch) {
            results.title = titleMatch[1];
        }

        const h1Match = data.match(/<h1[^>]*>(.*?)<\/h1>/is);
        if (h1Match) {
            results.h1 = h1Match[1];
        }

        if (results.h2) {
            const h2Pattern = /<h2[^>]*>(.*?)<\/h2>/gis;
            let count = 0;
            let h2Match;
            while ((h2Match = h2Pattern.exec(data)) !== null) {
                results.h2.push(h2Match[1]);
                // The option may arrive as a string when edited through the
                // text field (TODO confirm), so coerce before the strict
                // comparison. A limit that never equals the counter (e.g. 0)
                // means "no limit", as before.
                if (++count === Number(this.conf.limitH2Tags)) {
                    break;
                }
            }
        }

        return results;
    }
}
const { BaseParser } = require("a-parser-types");
/**
 * Example scraper for JavaScript API v2 (plain JavaScript flavour).
 *
 * Requests the URL given in the query and extracts the page <title>, the
 * first <h1>, and up to `limitH2Tags` <h2> headers from the response body.
 */
class JS_v2_example_js extends BaseParser {
    /**
     * Default configuration of the scraper.
     *
     * `results` declares two flat fields (title, h1) and one array (h2) whose
     * items have a single `header` field; `results_format` is the output
     * template that references them. `limitH2Tags` is a custom option read in
     * parse().
     * NOTE(review): `max_size` is presumably the maximum response size in
     * bytes (2 MiB) and `parsecodes` the accepted HTTP status codes — confirm
     * against the BaseParser documentation.
     */
    static defaultConf = {
        version: '0.0.1',
        results: {
            flat: [
                ['title', 'Title'],
                ['h1', 'H1 Header'],
            ],
            arrays: {
                h2: ['H2 Headers List', [
                    ['header', 'Header'],
                ]],
            },
        },
        max_size: 2 * 1024 * 1024,
        parsecodes: {
            200: 1,
        },
        results_format: "Title: $title\nH1: $h1\nH2 headers:\n$h2.format('$header\\n')\n",
        limitH2Tags: 3,
    };

    /** Options exposed for editing: `limitH2Tags` as a text field. */
    static editableConf = [
        ['limitH2Tags', ['textfield', 'Limit H2 tags']],
    ];

    /**
     * Fetches `set.query` with a GET request and fills `results`.
     *
     * @param {object} set - current task item; only `set.query` is read (used as the URL).
     * @param {object} results - results object to fill; `results.h2` is only
     *   populated when it is present on the object.
     * @returns {Promise<object>} the same `results` object, unchanged if the
     *   request failed or the response body is not a string.
     */
    async parse(set, results) {
        const { success, data } = await this.request('GET', set.query);
        if (!success || typeof data !== 'string') {
            return results;
        }

        // HTML tag names are case-insensitive and tag content may span several
        // lines, hence the `i` and `s` (dotAll) flags.
        const titleMatch = data.match(/<title[^>]*>(.*?)<\/title>/is);
        if (titleMatch) {
            results.title = titleMatch[1];
        }

        const h1Match = data.match(/<h1[^>]*>(.*?)<\/h1>/is);
        if (h1Match) {
            results.h1 = h1Match[1];
        }

        if (results.h2) {
            const h2Pattern = /<h2[^>]*>(.*?)<\/h2>/gis;
            let count = 0;
            let h2Match;
            while ((h2Match = h2Pattern.exec(data)) !== null) {
                results.h2.push(h2Match[1]);
                // The option may arrive as a string when edited through the
                // text field (TODO confirm), so coerce before the strict
                // comparison. A limit that never equals the counter (e.g. 0)
                // means "no limit", as before.
                if (++count === Number(this.conf.limitH2Tags)) {
                    break;
                }
            }
        }

        return results;
    }
}
TODO: (next) ## Inheritance
Useful links
Example of saving a file to disk
An example demonstrating how to save files directly to disk
Example of working with sessions
Using session functionality in JavaScript scrapers
Example of saving data in a session
Demonstration of the ability to store arbitrary data in a session
Using results.addElement()
An example of filling an array of data using results.addElement() and demonstrating the difference from the usual .push()