diff --git a/readme.md b/readme.md index e2f6025..c0ac18f 100644 --- a/readme.md +++ b/readme.md @@ -17,6 +17,8 @@ Extract current page ```js $('.item', ['a', 'a@href']); new Extractor().task('.item', ['a', 'a@href']).start(); +// fieldSelectors can be empty strings if items have no child to select +new Extractor().task('.item a', ['', '@href']).start(); ``` > `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later. @@ -67,7 +69,9 @@ job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/ job.stop(); ``` -## Extract Attributes. +> Next time you call `job.start();`, the job will continue from where it stopped. + +## Extract Attributes e.g.: link text and target (use 'selector@attribute') @@ -75,6 +79,14 @@ e.g.: link text and target (use 'selector@attribute') new Extractor().task('.item', ['a', 'a@href']).start(); ``` +## Click Selected Elements + +The following clicks selected links and extracts link `text` and `href` + +```js +new Extractor().task('.item', ['!a', 'a@href']).start(); +``` + ## Advanced Usage ### Use Task Chain. 
@@ -202,6 +214,26 @@ To stop watching, you can either `close current window`, or: e.stop(); ``` +## Results Operation + +To get the results of a task: + +```js +let results = job.results(0); +``` + +Visit URLs (if any) in the results one by one: + +```js +results.visit(); +``` + +Walk through all results one by one: + +```js +results.walk((row,col,value)=>{console.log(value)}); +``` + ## Developpment Clone this project and execute: diff --git a/src/background/actions.ts b/src/background/actions.ts index 270a60d..7cdd218 100644 --- a/src/background/actions.ts +++ b/src/background/actions.ts @@ -8,14 +8,14 @@ import { logger } from "../common/logger"; * @param {string} url target URL * @returns {Promise} a promise of target URL */ -export function redirectTab(tab: chrome.tabs.Tab, url: string) { +export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) { return queryUrl(tab).then(u => { if (url !== u) { let req: Request = { action: Actions.GOTO_URL, url: url } - let checker: ResponseChecker = async (r, err, tryCount): Promise => { + let checker: ResponseChecker = !check ? undefined : async (r, err, tryCount): Promise => { let queryErr: any; let newURL = await queryUrl(tab).catch(e => queryErr = e); if (queryErr) { @@ -23,10 +23,12 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) { } if (newURL == url) return url; if ( - tryCount % 1 == 0 && - !confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.') + confirm(`Cannot navigate to target url. +expected: ${url}\n +actual: ${newURL}\n +Press OK to continue, Cancel to retry. 
Close the tab to stop`) ) { - throw "Tasks stopped by user."; + return newURL; } return undefined; } @@ -111,7 +113,7 @@ export function scrollToBottom(tab: chrome.tabs.Tab) { return sendMessage(tab, req, 'Scroll to page bottom...'); } -export async function createTab(url: string, active: boolean) { +export async function createTab(url: string, active: boolean): Promise { return new Promise((resolve, reject) => { findIncognitoWindow().then( incognitoWindow => { @@ -179,7 +181,7 @@ export async function CreateIncognitoWindow() { }); } -export async function getActiveTab(currentWindow: boolean) { +export async function getActiveTab(currentWindow: boolean): Promise { return new Promise((resolve, reject) => { chrome.tabs.query({ active: true, @@ -190,7 +192,7 @@ export async function getActiveTab(currentWindow: boolean) { }) } -export async function getTabByID(id: number) { +export async function getTabByID(id: number): Promise { return new Promise((resolve, reject) => { chrome.tabs.get(id, function (tab) { chrome.runtime.lastError; diff --git a/src/background/extractor.ts b/src/background/extractor.ts index 6eaac2d..4a5ab8d 100644 --- a/src/background/extractor.ts +++ b/src/background/extractor.ts @@ -1,9 +1,9 @@ import { Task } from "./task"; -import { saveFile } from "./tools"; -import { createTab, getActiveTab, ping } from "./actions"; -import { ExtractResult } from "./result"; +import { parseUrls, saveFile } from "./tools"; +import { createTab, getActiveTab, ping, redirectTab } from "./actions"; import { logger } from "../common/logger"; import { caches } from "./caches"; +import { ExtractResult } from "./result"; export class Extractor { private _tasks: Task[] = []; @@ -50,6 +50,17 @@ export class Extractor { this._tasks.push(new Task(this._options, ...args)); return this; } + /** + * Get the results of a task. + * One Extractor can have multiple tasks, which are organized in a task chain. 
+ * The task is selected by its index in that chain. + * @param {number} id index of the task whose results are returned; nothing is returned when the id check fails. + */ + results(id?: number): ExtractResult { + id = this._checkTaskId(id); + if (id < 0) return; + return this._tasks[id].results; + } /** * Clear tasks and task caches. */ @@ -121,7 +132,7 @@ export class Extractor { if (i < from) return; if (i > 0) { let prevTask = this._tasks[i - 1]; - return task.execute(tab, new ExtractResult(prevTask.results)); + return task.execute(tab, prevTask.results); } return task.execute(tab); }); @@ -143,22 +154,22 @@ export class Extractor { let id = this._checkTaskId(taskid, this._tasks.length - 1); if (id < 0) return; let results = this._tasks[id].results - if (!results.length) { + let count = results.data.length + if (!count) { logger.info(`No result for task #${id}. Forget to call ".start()"?`); return; } - results.unshift(this._tasks[id].fieldSelectors); - let exResults = new ExtractResult(results); + results.header = this._tasks[id].fieldSelectors; let msg = ` -Please confirm to download (${results.length - 1} items): +Please confirm to download (${count} items): -${exResults.toString(50) || "- Empty -"} +${results.toString(50) || "- Empty -"} `.trim(); if (confirm(msg)) { - saveFile(exResults.toString(), "text/csv"); + saveFile(results.toString(), "text/csv"); } } - _checkTaskId(id: number, defaultId?: number) { + private _checkTaskId(id: number, defaultId?: number) { if (!this._tasks.length) { logger.info("No task found."); return -1; diff --git a/src/background/result.ts b/src/background/result.ts index 75cd840..73a9ab7 100644 --- a/src/background/result.ts +++ b/src/background/result.ts @@ -1,6 +1,11 @@ +import { logger } from "../common/logger"; +import { getActiveTab, ping, redirectTab } from "./actions"; +import { parseUrls } from "./tools"; + export class ExtractResult { + private _header: string[]; private _data: 
string[][] = []; - constructor(data) { + constructor(data: string[][]) { this._data = data || []; } row(index: number): string[] { @@ -14,11 +19,17 @@ export class ExtractResult { squash(): string[] { return this._data.reduce((p, c) => p.concat(c), []); } + set header(h: string[]) { + this._header = h + } get data(): string[][] { return this._data; } toString(rowsCount: number = 0): string { let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data; + if (this._header && this._header.length) { + data.unshift(this._header); + } return data.slice().reduce( (csv, lineCells) => { if (!lineCells || !lineCells.length) { @@ -35,4 +46,40 @@ export class ExtractResult { "" ); } + async walk(fn: (row: number, col: number, value: string) => void) { + let pms = Promise.resolve(null); + for (let i = 0; i < this._data.length; i++) { + let cells = this._data[i]; + for (let j = 0; j < cells.length; j++) { + let row = i; + let col = j; + let value = cells[j]; + pms = pms.then( + () => fn(row, col, value) + ) + } + } + return pms.catch(err => { + logger.error(err); + }); + } + async visit() { + let urls = parseUrls(this); + let tab = await getActiveTab(true) || await getActiveTab(false); + let succ = await ping(tab); + if (!succ) { + logger.error('Cannot contact with active tab.'); + return; + } + return urls.reduce( + (pms, url: string, i: number) => { + return pms.then( + async () => { + return redirectTab(tab, url, false); + }); + }, Promise.resolve(undefined) + ).catch(err => { + logger.error(err); + }); + } } \ No newline at end of file diff --git a/src/background/task.ts b/src/background/task.ts index 06a1cda..2ac73c4 100644 --- a/src/background/task.ts +++ b/src/background/task.ts @@ -41,10 +41,11 @@ export class Task { get urls(): string[] { return this._urls; } - get results(): string[][] { - return this._data_keys.reduce((p, c) => { + get results(): ExtractResult { + let rs: string[][] = this._data_keys.reduce((p, c) => { return p.concat(this._data[c]); }, 
[]); + return new ExtractResult(rs); } get fieldSelectors(): string[] { return this._fieldSelectors; diff --git a/src/background/tools.ts b/src/background/tools.ts index d293c68..3dcd459 100644 --- a/src/background/tools.ts +++ b/src/background/tools.ts @@ -2,7 +2,7 @@ import { ExtractResult } from "./result"; const URL_REG = /^\s*(https?):\/\//im; -export function parseUrls(...args) { +export function parseUrls(...args): string[] { if (!args.length) return []; let arg = args.shift(); if (arg instanceof Array) { diff --git a/src/content/actions.ts b/src/content/actions.ts index 1ff86db..d21d0f6 100644 --- a/src/content/actions.ts +++ b/src/content/actions.ts @@ -1,3 +1,5 @@ +import { logger } from "../common/logger"; + export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] { if (expectedURL && location.href != expectedURL) { throw 'Target tab URL changed, aborting...'; @@ -14,19 +16,42 @@ export function extract(itemsSelector: string, fieldSelectors: string[], expecte item => { return fieldSelectors.map( selector => { + let doClick = false; + if (selector.startsWith("!")) { + doClick = true; + selector = selector.substring(1); + } let [cls, attr] = selector.split('@').slice(0, 2); - let fieldVals = Array.from(item.querySelectorAll(cls)); - if (!fieldVals.length) { + let fieldElements: Element[]; + cls = cls.trim() + if (cls != "") { + fieldElements = Array.from(item.querySelectorAll(cls)); + } else { + fieldElements = [item]; + } + if (!fieldElements.length) { return; } fieldFound[selector] = true; - return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n') + return fieldElements.map(find => { + if (doClick) { + let e = document.createEvent("MouseEvents"); + e.initEvent("click", true, true); + find.dispatchEvent(e); + } + return attr ? 
find[attr] : find.textContent.trim(); + }).join('\n') } ) } ); + // TODO: configurable wait logic // if it exists a field, which is not found in any row, the sender should retry. - let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false); + let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]); + let shouldWait = notFoundFields.length > 0; + if (shouldWait) { + logger.debug('should wait for:', fieldSelectors.filter(f => !fieldFound[f]).join(',')); + } return shouldWait ? [] : results; }