Overview
The new JavaScript API v2 is designed to simplify development by replacing generators with async/await. In addition, some limitations of the previous version have been removed, TypeScript support has been added, and performance has been improved. We recommend using this JavaScript API for all new scrapers.
To use JavaScript API v2, it is sufficient to inherit your scraper class from the base class BaseParser. Let's examine the structure of a scraper class using an example:
- TypeScript
- JavaScript
import { BaseParser } from 'a-parser-types';
/**
 * Example scraper for JavaScript API v2 (TypeScript flavour).
 *
 * Issues a GET request for `set.query` and fills `results` with the first
 * <title>, the first <h1> and the <h2> headers found in the response body.
 */
export class JS_v2_example extends BaseParser {
    static defaultConf: typeof BaseParser.defaultConf = {
        version: '0.0.1',
        // Declared result fields: two flat values plus the `h2` array,
        // whose items expose a single `header` field.
        results: {
            flat: [
                ['title', 'Title'],
                ['h1', 'H1 Header'],
            ],
            arrays: {
                h2: ['H2 Headers List', [
                    ['header', 'Header'],
                ]],
            },
        },
        max_size: 2 * 1024 * 1024,
        parsecodes: {
            200: 1,
        },
        results_format: "Title: $title\nH1: $h1\nH2 headers:\n$h2.format('$header\\n')\n",
        // Custom option, read in parse() as this.conf.limitH2Tags.
        limitH2Tags: 3,
    };

    // Presumably exposes limitH2Tags in the settings UI as a text field
    // labelled "Limit H2 tags" - confirm against the BaseParser docs.
    static editableConf: typeof BaseParser.editableConf = [
        ['limitH2Tags', ['textfield', 'Limit H2 tags']],
    ];

    /**
     * Fetches `set.query` and extracts title / h1 / h2 values from the body.
     *
     * Tag matching is case-insensitive and the captured content may span
     * several lines; the captured text is stored as-is (not trimmed).
     *
     * @param set - Task item; only `set.query` is read and passed to the request.
     * @param results - Results object; `title` and `h1` are assigned, and
     *   `h2` is appended to only when `results.h2` is present.
     * @returns The same `results` object, unchanged when the request failed
     *   or the body is not a string.
     */
    async parse(set, results) {
        const { success, data } = await this.request('GET', set.query);
        if (!success || typeof data !== 'string') {
            return results;
        }

        // [\s\S] instead of "." so content spanning several lines matches.
        const titleMatch = data.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
        if (titleMatch) {
            results.title = titleMatch[1];
        }

        const h1Match = data.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
        if (h1Match) {
            results.h1 = h1Match[1];
        }

        if (results.h2) {
            const h2Pattern = /<h2[^>]*>([\s\S]*?)<\/h2>/gi;
            let collected = 0;
            let h2Match: RegExpExecArray | null;
            while ((h2Match = h2Pattern.exec(data)) !== null) {
                results.h2.push(h2Match[1]);
                collected += 1;
                // Loose equality is deliberate: the limit may arrive as a
                // string when edited through the text field (assumption -
                // confirm). A limit of 0 never matches, i.e. no limit.
                if (collected == this.conf.limitH2Tags) {
                    break;
                }
            }
        }

        return results;
    }
}
const { BaseParser } = require("a-parser-types");
/**
 * Example scraper for JavaScript API v2 (plain JavaScript flavour).
 *
 * Issues a GET request for `set.query` and fills `results` with the first
 * <title>, the first <h1> and the <h2> headers found in the response body.
 */
class JS_v2_example_js extends BaseParser {
    static defaultConf = {
        version: '0.0.1',
        // Declared result fields: two flat values plus the `h2` array,
        // whose items expose a single `header` field.
        results: {
            flat: [
                ['title', 'Title'],
                ['h1', 'H1 Header'],
            ],
            arrays: {
                h2: ['H2 Headers List', [
                    ['header', 'Header'],
                ]],
            },
        },
        max_size: 2 * 1024 * 1024,
        parsecodes: {
            200: 1,
        },
        results_format: "Title: $title\nH1: $h1\nH2 headers:\n$h2.format('$header\\n')\n",
        // Custom option, read in parse() as this.conf.limitH2Tags.
        limitH2Tags: 3,
    };

    // Presumably exposes limitH2Tags in the settings UI as a text field
    // labelled "Limit H2 tags" - confirm against the BaseParser docs.
    static editableConf = [
        ['limitH2Tags', ['textfield', 'Limit H2 tags']],
    ];

    /**
     * Fetches `set.query` and extracts title / h1 / h2 values from the body.
     *
     * Tag matching is case-insensitive and the captured content may span
     * several lines; the captured text is stored as-is (not trimmed).
     *
     * @param {object} set - Task item; only `set.query` is read and passed
     *   to the request.
     * @param {object} results - Results object; `title` and `h1` are
     *   assigned, and `h2` is appended to only when `results.h2` is present.
     * @returns {Promise<object>} The same `results` object, unchanged when
     *   the request failed or the body is not a string.
     */
    async parse(set, results) {
        const { success, data } = await this.request('GET', set.query);
        if (!success || typeof data !== 'string') {
            return results;
        }

        // [\s\S] instead of "." so content spanning several lines matches.
        const titleMatch = data.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
        if (titleMatch) {
            results.title = titleMatch[1];
        }

        const h1Match = data.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
        if (h1Match) {
            results.h1 = h1Match[1];
        }

        if (results.h2) {
            const h2Pattern = /<h2[^>]*>([\s\S]*?)<\/h2>/gi;
            let collected = 0;
            let h2Match;
            while ((h2Match = h2Pattern.exec(data)) !== null) {
                results.h2.push(h2Match[1]);
                collected += 1;
                // Loose equality is deliberate: the limit may arrive as a
                // string when edited through the text field (assumption -
                // confirm). A limit of 0 never matches, i.e. no limit.
                if (collected == this.conf.limitH2Tags) {
                    break;
                }
            }
        }

        return results;
    }
}
TODO: (next) ## Inheritance
Useful links
🔗 Example of saving a file to disk
An example demonstrating how to save files directly to disk
🔗 Example of working with sessions
Using session functionality in JavaScript scrapers
🔗 Example of saving data in a session
Demonstration of the ability to store arbitrary data in a session
🔗 Using results.addElement()
An example of filling an array of data using results.addElement() and demonstrating the difference from the usual .push()