improvements
* chance to continue on mismatch url for redirectTab * support empty field selectors * add Extractor.results() * add ExtractResult.walk(), ExtractResult.visit() * add ! directive to click elements * code optimize
This commit is contained in:
34
readme.md
34
readme.md
@ -17,6 +17,8 @@ Extract current page
|
|||||||
```js
|
```js
|
||||||
$('.item', ['a', 'a@href']);
|
$('.item', ['a', 'a@href']);
|
||||||
new Extractor().task('.item', ['a', 'a@href']).start();
|
new Extractor().task('.item', ['a', 'a@href']).start();
|
||||||
|
// fieldSelectors can be empty strings if items have no child to select
|
||||||
|
new Extractor().task('.item a', ['', '@href']).start();
|
||||||
```
|
```
|
||||||
|
|
||||||
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
|
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
|
||||||
@ -67,7 +69,9 @@ job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/
|
|||||||
job.stop();
|
job.stop();
|
||||||
```
|
```
|
||||||
|
|
||||||
## Extract Attributes.
|
> Next time you call `job.start();`, the job will continue from where it stopped.
|
||||||
|
|
||||||
|
## Extract Attributes
|
||||||
|
|
||||||
e.g.: link text and target (use 'selector@attribute')
|
e.g.: link text and target (use 'selector@attribute')
|
||||||
|
|
||||||
@ -75,6 +79,14 @@ e.g.: link text and target (use 'selector@attribute')
|
|||||||
new Extractor().task('.item', ['a', 'a@href']).start();
|
new Extractor().task('.item', ['a', 'a@href']).start();
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Click Selected Elements
|
||||||
|
|
||||||
|
The following clicks the selected links and extracts each link's `text` and `href`:
|
||||||
|
|
||||||
|
```js
|
||||||
|
new Extractor().task('.item', ['!a', 'a@href']).start();
|
||||||
|
```
|
||||||
|
|
||||||
## Advanced Usage
|
## Advanced Usage
|
||||||
|
|
||||||
### Use Task Chain.
|
### Use Task Chain.
|
||||||
@ -202,6 +214,26 @@ To stop watching, you can either `close current window`, or:
|
|||||||
e.stop();
|
e.stop();
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Results Operation
|
||||||
|
|
||||||
|
To get the results of a task:
|
||||||
|
|
||||||
|
```js
|
||||||
|
let results = job.results(0);
|
||||||
|
```
|
||||||
|
|
||||||
|
Visit URLs (if any) in the results one by one:
|
||||||
|
|
||||||
|
```js
|
||||||
|
results.visit();
|
||||||
|
```
|
||||||
|
|
||||||
|
Walk through all results one by one:
|
||||||
|
|
||||||
|
```js
|
||||||
|
results.walk((row,col,value)=>{console.log(value)});
|
||||||
|
```
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
Clone this project and execute:
|
Clone this project and execute:
|
||||||
|
|||||||
@ -8,14 +8,14 @@ import { logger } from "../common/logger";
|
|||||||
* @param {string} url target URL
|
* @param {string} url target URL
|
||||||
* @returns {Promise<string[]>} a promise of target URL
|
* @returns {Promise<string[]>} a promise of target URL
|
||||||
*/
|
*/
|
||||||
export function redirectTab(tab: chrome.tabs.Tab, url: string) {
|
export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
|
||||||
return queryUrl(tab).then(u => {
|
return queryUrl(tab).then(u => {
|
||||||
if (url !== u) {
|
if (url !== u) {
|
||||||
let req: Request = {
|
let req: Request = {
|
||||||
action: Actions.GOTO_URL,
|
action: Actions.GOTO_URL,
|
||||||
url: url
|
url: url
|
||||||
}
|
}
|
||||||
let checker: ResponseChecker<string> = async (r, err, tryCount): Promise<string> => {
|
let checker: ResponseChecker<string> = !check ? undefined : async (r, err, tryCount): Promise<string> => {
|
||||||
let queryErr: any;
|
let queryErr: any;
|
||||||
let newURL = await queryUrl(tab).catch(e => queryErr = e);
|
let newURL = await queryUrl(tab).catch(e => queryErr = e);
|
||||||
if (queryErr) {
|
if (queryErr) {
|
||||||
@ -23,10 +23,12 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
|
|||||||
}
|
}
|
||||||
if (newURL == url) return url;
|
if (newURL == url) return url;
|
||||||
if (
|
if (
|
||||||
tryCount % 1 == 0 &&
|
confirm(`Cannot navigate to target url.
|
||||||
!confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
|
expected: ${url}\n
|
||||||
|
actual: ${newURL}\n
|
||||||
|
Press OK to continue, Cancel to retry. Close the tab to stop`)
|
||||||
) {
|
) {
|
||||||
throw "Tasks stopped by user.";
|
return newURL;
|
||||||
}
|
}
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
@ -111,7 +113,7 @@ export function scrollToBottom(tab: chrome.tabs.Tab) {
|
|||||||
return sendMessage(tab, req, 'Scroll to page bottom...');
|
return sendMessage(tab, req, 'Scroll to page bottom...');
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function createTab(url: string, active: boolean) {
|
export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
findIncognitoWindow().then(
|
findIncognitoWindow().then(
|
||||||
incognitoWindow => {
|
incognitoWindow => {
|
||||||
@ -179,7 +181,7 @@ export async function CreateIncognitoWindow() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getActiveTab(currentWindow: boolean) {
|
export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
chrome.tabs.query({
|
chrome.tabs.query({
|
||||||
active: true,
|
active: true,
|
||||||
@ -190,7 +192,7 @@ export async function getActiveTab(currentWindow: boolean) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getTabByID(id: number) {
|
export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
chrome.tabs.get(id, function (tab) {
|
chrome.tabs.get(id, function (tab) {
|
||||||
chrome.runtime.lastError;
|
chrome.runtime.lastError;
|
||||||
|
|||||||
@ -1,9 +1,9 @@
|
|||||||
import { Task } from "./task";
|
import { Task } from "./task";
|
||||||
import { saveFile } from "./tools";
|
import { parseUrls, saveFile } from "./tools";
|
||||||
import { createTab, getActiveTab, ping } from "./actions";
|
import { createTab, getActiveTab, ping, redirectTab } from "./actions";
|
||||||
import { ExtractResult } from "./result";
|
|
||||||
import { logger } from "../common/logger";
|
import { logger } from "../common/logger";
|
||||||
import { caches } from "./caches";
|
import { caches } from "./caches";
|
||||||
|
import { ExtractResult } from "./result";
|
||||||
|
|
||||||
export class Extractor {
|
export class Extractor {
|
||||||
private _tasks: Task[] = [];
|
private _tasks: Task[] = [];
|
||||||
@ -50,6 +50,17 @@ export class Extractor {
|
|||||||
this._tasks.push(new Task(this._options, ...args));
|
this._tasks.push(new Task(this._options, ...args));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Add a task to Extractor. \n
|
||||||
|
* One Extractor can have multiple tasks, which are organized in a task chain.
|
||||||
|
* If URL arguments are not given for later tasks, they use the previous task's results as input (target URL list).
|
||||||
|
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
|
||||||
|
*/
|
||||||
|
results(id?: number): ExtractResult {
|
||||||
|
id = this._checkTaskId(id);
|
||||||
|
if (id < 0) return;
|
||||||
|
return this._tasks[id].results;
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Clear tasks and task caches.
|
* Clear tasks and task caches.
|
||||||
*/
|
*/
|
||||||
@ -121,7 +132,7 @@ export class Extractor {
|
|||||||
if (i < from) return;
|
if (i < from) return;
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
let prevTask = this._tasks[i - 1];
|
let prevTask = this._tasks[i - 1];
|
||||||
return task.execute(tab, new ExtractResult(prevTask.results));
|
return task.execute(tab, prevTask.results);
|
||||||
}
|
}
|
||||||
return task.execute(tab);
|
return task.execute(tab);
|
||||||
});
|
});
|
||||||
@ -143,22 +154,22 @@ export class Extractor {
|
|||||||
let id = this._checkTaskId(taskid, this._tasks.length - 1);
|
let id = this._checkTaskId(taskid, this._tasks.length - 1);
|
||||||
if (id < 0) return;
|
if (id < 0) return;
|
||||||
let results = this._tasks[id].results
|
let results = this._tasks[id].results
|
||||||
if (!results.length) {
|
let count = results.data.length
|
||||||
|
if (!count) {
|
||||||
logger.info(`No result for task #${id}. Forget to call ".start()"?`);
|
logger.info(`No result for task #${id}. Forget to call ".start()"?`);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
results.unshift(this._tasks[id].fieldSelectors);
|
results.header = this._tasks[id].fieldSelectors;
|
||||||
let exResults = new ExtractResult(results);
|
|
||||||
let msg = `
|
let msg = `
|
||||||
Please confirm to download (${results.length - 1} items):
|
Please confirm to download (${count} items):
|
||||||
|
|
||||||
${exResults.toString(50) || "- Empty -"}
|
${results.toString(50) || "- Empty -"}
|
||||||
`.trim();
|
`.trim();
|
||||||
if (confirm(msg)) {
|
if (confirm(msg)) {
|
||||||
saveFile(exResults.toString(), "text/csv");
|
saveFile(results.toString(), "text/csv");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_checkTaskId(id: number, defaultId?: number) {
|
private _checkTaskId(id: number, defaultId?: number) {
|
||||||
if (!this._tasks.length) {
|
if (!this._tasks.length) {
|
||||||
logger.info("No task found.");
|
logger.info("No task found.");
|
||||||
return -1;
|
return -1;
|
||||||
|
|||||||
@ -1,6 +1,11 @@
|
|||||||
|
import { logger } from "../common/logger";
|
||||||
|
import { getActiveTab, ping, redirectTab } from "./actions";
|
||||||
|
import { parseUrls } from "./tools";
|
||||||
|
|
||||||
export class ExtractResult {
|
export class ExtractResult {
|
||||||
|
private _header: string[];
|
||||||
private _data: string[][] = [];
|
private _data: string[][] = [];
|
||||||
constructor(data) {
|
constructor(data: string[][]) {
|
||||||
this._data = data || [];
|
this._data = data || [];
|
||||||
}
|
}
|
||||||
row(index: number): string[] {
|
row(index: number): string[] {
|
||||||
@ -14,11 +19,17 @@ export class ExtractResult {
|
|||||||
squash(): string[] {
|
squash(): string[] {
|
||||||
return this._data.reduce((p, c) => p.concat(c), []);
|
return this._data.reduce((p, c) => p.concat(c), []);
|
||||||
}
|
}
|
||||||
|
set header(h: string[]) {
|
||||||
|
this._header = h
|
||||||
|
}
|
||||||
get data(): string[][] {
|
get data(): string[][] {
|
||||||
return this._data;
|
return this._data;
|
||||||
}
|
}
|
||||||
toString(rowsCount: number = 0): string {
|
toString(rowsCount: number = 0): string {
|
||||||
let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
|
let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
|
||||||
|
if (this._header && this._header.length) {
|
||||||
|
data.unshift(this._header);
|
||||||
|
}
|
||||||
return data.slice().reduce(
|
return data.slice().reduce(
|
||||||
(csv, lineCells) => {
|
(csv, lineCells) => {
|
||||||
if (!lineCells || !lineCells.length) {
|
if (!lineCells || !lineCells.length) {
|
||||||
@ -35,4 +46,40 @@ export class ExtractResult {
|
|||||||
""
|
""
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
async walk(fn: (row: number, col: number, value: string) => void) {
|
||||||
|
let pms = Promise.resolve(null);
|
||||||
|
for (let i = 0; i < this._data.length; i++) {
|
||||||
|
let cells = this._data[i];
|
||||||
|
for (let j = 0; j < cells.length; j++) {
|
||||||
|
let row = i;
|
||||||
|
let col = j;
|
||||||
|
let value = cells[j];
|
||||||
|
pms = pms.then(
|
||||||
|
() => fn(row, col, value)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pms.catch(err => {
|
||||||
|
logger.error(err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
async visit() {
|
||||||
|
let urls = parseUrls(this);
|
||||||
|
let tab = await getActiveTab(true) || await getActiveTab(false);
|
||||||
|
let succ = await ping(tab);
|
||||||
|
if (!succ) {
|
||||||
|
logger.error('Cannot contact with active tab.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
return urls.reduce(
|
||||||
|
(pms, url: string, i: number) => {
|
||||||
|
return pms.then(
|
||||||
|
async () => {
|
||||||
|
return redirectTab(tab, url, false);
|
||||||
|
});
|
||||||
|
}, Promise.resolve<void>(undefined)
|
||||||
|
).catch(err => {
|
||||||
|
logger.error(err);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@ -41,10 +41,11 @@ export class Task {
|
|||||||
get urls(): string[] {
|
get urls(): string[] {
|
||||||
return this._urls;
|
return this._urls;
|
||||||
}
|
}
|
||||||
get results(): string[][] {
|
get results(): ExtractResult {
|
||||||
return this._data_keys.reduce((p, c) => {
|
let rs: string[][] = this._data_keys.reduce((p, c) => {
|
||||||
return p.concat(this._data[c]);
|
return p.concat(this._data[c]);
|
||||||
}, []);
|
}, []);
|
||||||
|
return new ExtractResult(rs);
|
||||||
}
|
}
|
||||||
get fieldSelectors(): string[] {
|
get fieldSelectors(): string[] {
|
||||||
return this._fieldSelectors;
|
return this._fieldSelectors;
|
||||||
|
|||||||
@ -2,7 +2,7 @@ import { ExtractResult } from "./result";
|
|||||||
|
|
||||||
const URL_REG = /^\s*(https?):\/\//im;
|
const URL_REG = /^\s*(https?):\/\//im;
|
||||||
|
|
||||||
export function parseUrls(...args) {
|
export function parseUrls(...args): string[] {
|
||||||
if (!args.length) return [];
|
if (!args.length) return [];
|
||||||
let arg = args.shift();
|
let arg = args.shift();
|
||||||
if (arg instanceof Array) {
|
if (arg instanceof Array) {
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
import { logger } from "../common/logger";
|
||||||
|
|
||||||
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
|
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
|
||||||
if (expectedURL && location.href != expectedURL) {
|
if (expectedURL && location.href != expectedURL) {
|
||||||
throw 'Target tab URL changed, aborting...';
|
throw 'Target tab URL changed, aborting...';
|
||||||
@ -14,19 +16,42 @@ export function extract(itemsSelector: string, fieldSelectors: string[], expecte
|
|||||||
item => {
|
item => {
|
||||||
return fieldSelectors.map(
|
return fieldSelectors.map(
|
||||||
selector => {
|
selector => {
|
||||||
|
let doClick = false;
|
||||||
|
if (selector.startsWith("!")) {
|
||||||
|
doClick = true;
|
||||||
|
selector = selector.substring(1);
|
||||||
|
}
|
||||||
let [cls, attr] = selector.split('@').slice(0, 2);
|
let [cls, attr] = selector.split('@').slice(0, 2);
|
||||||
let fieldVals = Array.from(item.querySelectorAll(cls));
|
let fieldElements: Element[];
|
||||||
if (!fieldVals.length) {
|
cls = cls.trim()
|
||||||
|
if (cls != "") {
|
||||||
|
fieldElements = Array.from(item.querySelectorAll(cls));
|
||||||
|
} else {
|
||||||
|
fieldElements = [item];
|
||||||
|
}
|
||||||
|
if (!fieldElements.length) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
fieldFound[selector] = true;
|
fieldFound[selector] = true;
|
||||||
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n')
|
return fieldElements.map(find => {
|
||||||
|
if (doClick) {
|
||||||
|
let e = document.createEvent("MouseEvents");
|
||||||
|
e.initEvent("click", true, true);
|
||||||
|
find.dispatchEvent(e);
|
||||||
|
}
|
||||||
|
return attr ? find[attr] : find.textContent.trim();
|
||||||
|
}).join('\n')
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
// TODO: configurable wait logic
|
||||||
// If there is a field that is not found in any row, the sender should retry.
|
// If there is a field that is not found in any row, the sender should retry.
|
||||||
let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false);
|
let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
|
||||||
|
let shouldWait = notFoundFields.length > 0;
|
||||||
|
if (shouldWait) {
|
||||||
|
logger.debug('should wait for:', fieldSelectors.filter(f => !fieldFound[f]).join(','));
|
||||||
|
}
|
||||||
return shouldWait ? [] : results;
|
return shouldWait ? [] : results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user