Compare commits

...

9 Commits

Author SHA1 Message Date
e87e7010ec improvements
* chance to continue on mismatch url for redirectTab
* support empty field selectors
* add Extractor.results()
* add ExtractResult.walk(), ExtractResult.visit()
* add ! directive to click elements
* code optimize
2021-04-20 14:20:05 +08:00
108ebb835f fix task running state 2021-04-20 12:01:10 +08:00
e0b0a5e986 add timeout for messaging 2021-04-20 12:00:59 +08:00
9cd25e3c1d update url 2021-04-19 15:58:04 +08:00
7827d385bd refactor 2020-06-16 14:45:36 +08:00
ade0670415 update readme 2020-01-17 11:01:13 +08:00
63aec616b1 code optimize 2020-01-17 09:38:40 +08:00
378883b626 check url change before extract data 2020-01-16 15:11:49 +08:00
c78f593c70 code optimize 2020-01-16 09:59:19 +08:00
16 changed files with 339 additions and 109 deletions

View File

@ -17,6 +17,8 @@ Extract current page
```js ```js
$('.item', ['a', 'a@href']); $('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start(); new Extractor().task('.item', ['a', 'a@href']).start();
// fieldSelectors can be empty strings if items have no child to select
new Extractor().task('.item a', ['', '@href']).start();
``` ```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later. > `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
@ -67,7 +69,9 @@ job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/
job.stop(); job.stop();
``` ```
## Extract Attributes. > Next time you call `job.start();`, the job will continue from where it stopped.
## Extract Attributes
e.g.: link text and target (use 'selector@attribute') e.g.: link text and target (use 'selector@attribute')
@ -75,6 +79,14 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start(); new Extractor().task('.item', ['a', 'a@href']).start();
``` ```
## Click Selected Elements
The following clicks the selected links and extracts each link's `text` and `href`:
```js
new Extractor().task('.item', ['!a', 'a@href']).start();
```
## Advanced Usage ## Advanced Usage
### Use Task Chain. ### Use Task Chain.
@ -185,6 +197,43 @@ e.start();
> The uploaded state will be cleaned in 30 seconds, if you don't load it. > The uploaded state will be cleaned in 30 seconds, if you don't load it.
## Watch Mode
Watch mode tries to extract data from every page you visit **in the current window**.
```js
e = new Extractor();
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]);
e.watch(1); // start watching for first task
```
To stop watching, you can either `close current window`, or:
```js
e.stop();
```
## Results Operation
To get the results of a task:
```js
let results = job.results(0);
```
Visit URLs (if any) in the results one by one:
```js
results.visit();
```
Walk through all results one by one:
```js
results.walk((row,col,value)=>{console.log(value)});
```
## Development ## Development
Clone this project and execute: Clone this project and execute:

View File

@ -1,6 +1,6 @@
import { Actions, Request } from "../common"; import { Actions, Request } from "../common";
import { sendMessage } from "./messaging"; import { sendMessage, ResponseChecker } from "./messaging";
import { logger } from "./common"; import { logger } from "../common/logger";
/** /**
* redirect tab to url. * redirect tab to url.
@ -8,25 +8,27 @@ import { logger } from "./common";
* @param {string} url target URL * @param {string} url target URL
* @returns {Promise<string[]>} a promise of target URL * @returns {Promise<string[]>} a promise of target URL
*/ */
export function redirectTab(tab: chrome.tabs.Tab, url: string) { export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
return queryUrl(tab).then(u => { return queryUrl(tab).then(u => {
if (url !== u) { if (url !== u) {
let req: Request = { let req: Request = {
action: Actions.GOTO_URL, action: Actions.GOTO_URL,
url: url url: url
} }
let checker = async (u, err, tryCount): Promise<string> => { let checker: ResponseChecker<string> = !check ? undefined : async (r, err, tryCount): Promise<string> => {
let queryErr: any; let queryErr: any;
let newURL = await queryUrl(tab).catch(e => queryErr = e); let newURL = await queryUrl(tab).catch(e => queryErr = e);
if (queryErr) { if (queryErr) {
return Promise.reject(queryErr); throw queryErr;
} }
if (newURL == url) return url; if (newURL == url) return url;
if ( if (
tryCount % 1 == 0 && confirm(`Cannot navigate to target url.
!confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.') expected: ${url}\n
actual: ${newURL}\n
Press OK to continue, Cancel to retry. Close the tab to stop`)
) { ) {
return Promise.reject("Tasks stopped by user."); return newURL;
} }
return undefined; return undefined;
} }
@ -42,13 +44,16 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @returns {Promise<string[]>} a promise of extracted data * @returns {Promise<string[]>} a promise of extracted data
*/ */
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], askOnfail?: boolean) { export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], expectedURL?: string, askOnfail?: boolean) {
let req = { let req: Request = {
action: Actions.EXTRACT, action: Actions.EXTRACT,
itemsSelector: itemsSelector, itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors fieldSelectors: fieldSelectors,
url: expectedURL,
} }
let checker = (result, err, tryCount) => { let checker: ResponseChecker<string[][]> = (response, err, tryCount) => {
if (response.error) throw response.error;
let result = response.result;
if (!result || !result.length) { if (!result || !result.length) {
if ( if (
tryCount % 20 == 0 && ( tryCount % 20 == 0 && (
@ -76,8 +81,10 @@ export async function ping(tab, count = 1) {
let req = { let req = {
action: Actions.PING action: Actions.PING
} }
let checker = (r: string, e, c) => r == "pong" ? r : undefined; let checker: ResponseChecker<string> = (r, e, c) =>
let pong = await sendMessage<string>(tab, req, 'Check tab availability...', checker, 1000, count).catch(() => { }); r.result == "pong" ? r.result : undefined;
let pong = await sendMessage<string>(tab, req, 'Check tab availability...', checker, 1000, 1000, count).catch(() => { });
return pong == "pong"; return pong == "pong";
} }
@ -106,7 +113,7 @@ export function scrollToBottom(tab: chrome.tabs.Tab) {
return sendMessage(tab, req, 'Scroll to page bottom...'); return sendMessage(tab, req, 'Scroll to page bottom...');
} }
export async function createTab(url: string, active: boolean) { export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
findIncognitoWindow().then( findIncognitoWindow().then(
incognitoWindow => { incognitoWindow => {
@ -174,7 +181,7 @@ export async function CreateIncognitoWindow() {
}); });
} }
export async function getActiveTab(currentWindow: boolean) { export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.query({ chrome.tabs.query({
active: true, active: true,
@ -185,7 +192,7 @@ export async function getActiveTab(currentWindow: boolean) {
}) })
} }
export async function getTabByID(id: number) { export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.get(id, function (tab) { chrome.tabs.get(id, function (tab) {
chrome.runtime.lastError; chrome.runtime.lastError;

View File

@ -1,4 +1,4 @@
import { logger } from "./common"; import { logger } from "../common/logger";
import { Actions } from "../common"; import { Actions } from "../common";
import { messageSubscribers } from "./messaging"; import { messageSubscribers } from "./messaging";
@ -26,4 +26,6 @@ export class Caches {
} }
}, 30000); }, 30000);
} }
} }
export const caches = new Caches();

View File

@ -1,6 +0,0 @@
import { Logger, LOGGER_LEVEL } from "./logger";
import { Caches } from "./caches";
export const caches = new Caches();
export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);
export const URL_REG = /^\s*(https?):\/\//im;

View File

@ -1,7 +1,8 @@
import { Task } from "./task"; import { Task } from "./task";
import { saveFile } from "./tools"; import { parseUrls, saveFile } from "./tools";
import { createTab, getActiveTab, ping } from "./actions"; import { createTab, getActiveTab, ping, redirectTab } from "./actions";
import { logger, caches } from "./common"; import { logger } from "../common/logger";
import { caches } from "./caches";
import { ExtractResult } from "./result"; import { ExtractResult } from "./result";
export class Extractor { export class Extractor {
@ -11,6 +12,14 @@ export class Extractor {
constructor(options?) { constructor(options?) {
if (options) this._options = options; if (options) this._options = options;
} }
/**
 * Check whether the active tab can be contacted.
 * The active tab of the current window is tried first; if none is returned,
 * `getActiveTab(false)` is used instead.
 * An error is logged when the ping does not succeed; nothing is returned either way.
 * @param {number} count value forwarded to `ping` as its `count` argument
 */
static async ping(count: number = 1) {
    const tab = (await getActiveTab(true)) || (await getActiveTab(false));
    const reachable = await ping(tab, count);
    if (reachable) return;
    logger.error('Cannot contact with active tab.');
}
/** /**
* Save current state, in case we restore it later. * Save current state, in case we restore it later.
*/ */
@ -41,6 +50,17 @@ export class Extractor {
this._tasks.push(new Task(this._options, ...args)); this._tasks.push(new Task(this._options, ...args));
return this; return this;
} }
/**
 * Get the results of a task.
 * @param {number} id id of the task; it is checked with `_checkTaskId` first
 * @returns {ExtractResult} results of the task, or undefined when the
 * id check yields a negative id (e.g. no task has been added yet)
 */
results(id?: number): ExtractResult {
    const taskId = this._checkTaskId(id);
    if (taskId < 0) return;
    return this._tasks[taskId].results;
}
/** /**
* Clear tasks and task caches. * Clear tasks and task caches.
*/ */
@ -112,7 +132,7 @@ export class Extractor {
if (i < from) return; if (i < from) return;
if (i > 0) { if (i > 0) {
let prevTask = this._tasks[i - 1]; let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results)); return task.execute(tab, prevTask.results);
} }
return task.execute(tab); return task.execute(tab);
}); });
@ -134,22 +154,22 @@ export class Extractor {
let id = this._checkTaskId(taskid, this._tasks.length - 1); let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (id < 0) return; if (id < 0) return;
let results = this._tasks[id].results let results = this._tasks[id].results
if (!results.length) { let count = results.data.length
if (!count) {
logger.info(`No result for task #${id}. Forget to call ".start()"?`); logger.info(`No result for task #${id}. Forget to call ".start()"?`);
return; return;
} }
results.unshift(this._tasks[id].fieldSelectors); results.header = this._tasks[id].fieldSelectors;
let exResults = new ExtractResult(results);
let msg = ` let msg = `
Please confirm to download (${results.length - 1} items) Please confirm to download (${count} items)
${exResults.toString(50) || "- Empty -"} ${results.toString(50) || "- Empty -"}
`.trim(); `.trim();
if (confirm(msg)) { if (confirm(msg)) {
saveFile(exResults.toString(), "text/csv"); saveFile(results.toString(), "text/csv");
} }
} }
_checkTaskId(id: number, defaultId?: number) { private _checkTaskId(id: number, defaultId?: number) {
if (!this._tasks.length) { if (!this._tasks.length) {
logger.info("No task found."); logger.info("No task found.");
return -1; return -1;

View File

@ -1,7 +1,11 @@
import { Request, Actions } from "../common"; import { Request, Actions, Response } from "../common";
import { getTabByID } from "./actions"; import { getTabByID } from "./actions";
import { logger } from "./common"; import { logger } from "../common/logger";
export type ResponseCheckerSync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => T;
export type ResponseCheckerAsync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => Promise<T>;
export type ResponseChecker<T> = ResponseCheckerSync<T> | ResponseCheckerAsync<T>;
/** /**
* Sending a message to target tab repeatedly until the response is not undefined. * Sending a message to target tab repeatedly until the response is not undefined.
* @param {object} tab the table where to send the message * @param {object} tab the table where to send the message
@ -17,12 +21,14 @@ import { logger } from "./common";
*/ */
export function sendMessage<T>( export function sendMessage<T>(
tab: chrome.tabs.Tab, tab: chrome.tabs.Tab,
req, req: Request,
log?: string, log?: string,
dataChecker?: (r: T, err: chrome.runtime.LastError, count: number) => T | Promise<T>, dataChecker?: ResponseChecker<T>,
timeout?: number,
interval?: number, interval?: number,
limit?: number limit?: number
) { ) {
timeout = timeout || 10;
interval = interval || 500; interval = interval || 500;
limit = isNaN(limit) ? 0 : limit; limit = isNaN(limit) ? 0 : limit;
let count = 0; let count = 0;
@ -43,33 +49,23 @@ export function sendMessage<T>(
return; return;
} }
count++; count++;
chrome.tabs.sendMessage(tab.id, req, async (r: T) => { let timeout = setTimeout(() => { reject(`${Actions[req.action]} requset timeout after ${timeout}s`) }, 10000);
chrome.tabs.sendMessage(tab.id, req, async (r: Response<T>) => {
clearTimeout(timeout);
// check error but do nothing until dataChecker. // check error but do nothing until dataChecker.
let err = chrome.runtime.lastError; let err = chrome.runtime.lastError;
let result: T = r; let [result, error] = await checkResponse(dataChecker, r, err, count);
if (error) {
if (dataChecker) { reject(error);
let pms = dataChecker(r, err, count); return;
// don't catch if it's not a Promise
if (pms instanceof Promise) {
let checkerError: any;
pms = pms.catch(e => checkerError = e);
result = await pms;
if (checkerError) {
reject(checkerError);
return;
}
} else {
result = pms;
}
} }
let flag = result !== undefined;
let flag = result !== undefined && result !== null;
if (log) logger.info(log, flag ? '(OK)' : '(failed)'); if (log) logger.info(log, flag ? '(OK)' : '(failed)');
if (flag) { if (flag) {
resolve(result); resolve(result);
} else { } else {
setTimeout(() => { setTimeout(() => {
logger.debug('Invalid response', r, 'retry...');
loop(); loop();
}, interval); }, interval);
} }
@ -78,7 +74,43 @@ export function sendMessage<T>(
}); });
} }
export type ActionSubscriber = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void | Promise<void>; async function checkResponse<T>(
dataChecker: ResponseChecker<T>,
response: Response<T>,
error: chrome.runtime.LastError,
tryCount: number
): Promise<[T, string]> {
// response could be undefined if the content script is interrupted.
// don't check, tell sendMessage to retry.
if (!response) return [undefined, undefined];
if (!dataChecker) {
return [response.result, response.error];
}
let result: T;
let pms: T | Promise<T>;
try {
pms = dataChecker(response, error, tryCount);
} catch (err) {
return [undefined, err];
}
// don't catch if it's not a Promise
if (pms instanceof Promise) {
let checkerError: any;
pms = pms.catch(e => checkerError = e);
result = await pms;
if (checkerError) {
return [undefined, checkerError];
}
} else {
result = pms;
}
return [result, undefined];
}
export type ActionSubscriberSync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void;
export type ActionSubscriberAsync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => Promise<void>;
export type ActionSubscriber = ActionSubscriberSync | ActionSubscriberAsync;
class MessageSubscribers { class MessageSubscribers {
private listeners: { [key: number]: ActionSubscriber[] } = {}; private listeners: { [key: number]: ActionSubscriber[] } = {};
addListener(action: Actions, subscriber: ActionSubscriber) { addListener(action: Actions, subscriber: ActionSubscriber) {

View File

@ -1,6 +1,11 @@
import { logger } from "../common/logger";
import { getActiveTab, ping, redirectTab } from "./actions";
import { parseUrls } from "./tools";
export class ExtractResult { export class ExtractResult {
private _header: string[];
private _data: string[][] = []; private _data: string[][] = [];
constructor(data) { constructor(data: string[][]) {
this._data = data || []; this._data = data || [];
} }
row(index: number): string[] { row(index: number): string[] {
@ -14,11 +19,17 @@ export class ExtractResult {
squash(): string[] { squash(): string[] {
return this._data.reduce((p, c) => p.concat(c), []); return this._data.reduce((p, c) => p.concat(c), []);
} }
/**
 * Set the header row; `toString()` prepends it to the data when it is non-empty.
 */
set header(columns: string[]) {
    this._header = columns;
}
get data(): string[][] { get data(): string[][] {
return this._data; return this._data;
} }
toString(rowsCount: number = 0): string { toString(rowsCount: number = 0): string {
let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data; let data = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
if (this._header && this._header.length) {
data.unshift(this._header);
}
return data.slice().reduce( return data.slice().reduce(
(csv, lineCells) => { (csv, lineCells) => {
if (!lineCells || !lineCells.length) { if (!lineCells || !lineCells.length) {
@ -26,6 +37,7 @@ export class ExtractResult {
} }
let line = lineCells.reduce( let line = lineCells.reduce(
(lineText, cell, idx) => { (lineText, cell, idx) => {
cell = cell || "";
cell = '"' + cell.trim().replace(/"/g, '""') + '"'; cell = '"' + cell.trim().replace(/"/g, '""') + '"';
return lineText + cell + (idx == lineCells.length - 1 ? "" : ",") return lineText + cell + (idx == lineCells.length - 1 ? "" : ",")
}, ""); }, "");
@ -34,4 +46,40 @@ export class ExtractResult {
"" ""
); );
} }
/**
 * Call `fn` once for every cell of the result data, row by row and,
 * within a row, column by column.
 * The calls are chained on a promise: each call starts only after the
 * previous one has finished (including a promise it may have returned).
 * If a call throws or rejects, the remaining calls are skipped and the
 * error is logged instead of being propagated.
 * @param fn callback receiving the row index, column index and cell value
 */
async walk(fn: (row: number, col: number, value: string) => void) {
    let chain: Promise<any> = Promise.resolve(null);
    for (const [row, cells] of this._data.entries()) {
        for (const [col, value] of cells.entries()) {
            // the cell value is captured now, while the chain is being built
            chain = chain.then(() => fn(row, col, value));
        }
    }
    return chain.catch(err => {
        logger.error(err);
    });
}
/**
 * Visit the URLs found in the results (via `parseUrls`) one by one in the active tab.
 * The active tab of the current window is preferred, falling back to
 * `getActiveTab(false)`. Nothing is visited if the tab does not answer a ping.
 * Redirects run sequentially; the first failure stops the sequence and is logged.
 */
async visit(): Promise<void> {
    const urls = parseUrls(this);
    const tab = (await getActiveTab(true)) || (await getActiveTab(false));
    const reachable = await ping(tab);
    if (!reachable) {
        logger.error('Cannot contact with active tab.');
        return;
    }
    let chain: Promise<any> = Promise.resolve<void>(undefined);
    for (const url of urls) {
        // third argument `false`: redirect without the URL check
        chain = chain.then(() => redirectTab(tab, url, false));
    }
    return chain.catch(err => {
        logger.error(err);
    });
}
} }

View File

@ -20,7 +20,7 @@ function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
$(".item", ["a", "a@href"]); $(".item", ["a", "a@href"]);
## See Detailed Help: ## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion https://git.qjebbs.com/jebbs/data-extracter-extesion
`.trim(); `.trim();
export function testArgs(...args: any) { export function testArgs(...args: any) {

View File

@ -4,7 +4,7 @@ import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result"; import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging"; import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common"; import { Actions } from "../common";
import { logger } from "./common"; import { logger } from "../common/logger";
export class Task { export class Task {
private _data: { [key: string]: string[][] } = {}; private _data: { [key: string]: string[][] } = {};
@ -41,10 +41,11 @@ export class Task {
get urls(): string[] { get urls(): string[] {
return this._urls; return this._urls;
} }
get results(): string[][] { get results(): ExtractResult {
return this._data_keys.reduce((p, c) => { let rs: string[][] = this._data_keys.reduce((p, c) => {
return p.concat(this._data[c]); return p.concat(this._data[c]);
}, []); }, []);
return new ExtractResult(rs);
} }
get fieldSelectors(): string[] { get fieldSelectors(): string[] {
return this._fieldSelectors; return this._fieldSelectors;
@ -73,6 +74,7 @@ export class Task {
logger.info("No window to watch..."); logger.info("No window to watch...");
return; return;
} }
let watchTaskID = 0;
let listener: ActionSubscriber = async (request, sender, sendResponse) => { let listener: ActionSubscriber = async (request, sender, sendResponse) => {
let findWindow = await getWindowByID(window.id); let findWindow = await getWindowByID(window.id);
if (!findWindow) { if (!findWindow) {
@ -82,32 +84,46 @@ export class Task {
} }
// only watch current window. // only watch current window.
if (sender.tab.windowId != window.id) return; if (sender.tab.windowId != window.id) return;
let taskID = watchTaskID++;
logger.info(`Watcher #${taskID} starts.`);
let pm = this.makeOptionalTasks(sender.tab); let pm = this.makeOptionalTasks(sender.tab);
return pm.then( return pm.then(
() => extractTabData(sender.tab, this._itemsSelector, this._fieldSelectors, false) () => extractTabData(sender.tab, this._itemsSelector, this._fieldSelectors, sender.tab.url, true)
).then( ).then(
results => { results => {
if (results && results.length) { if (results && results.length) {
this.saveResult(results, sender.tab.url); this.saveResult(results, sender.tab.url);
} }
logger.info(`Watcher #${taskID} ends.`);
} }
).catch( ).catch(
e => logger.error(e) e => logger.error(`Watcher #${taskID} ends with:`, e)
) )
} }
this._listeners.push(listener); this._listeners.push(listener);
messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener); messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
} }
async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> { async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
if (!tab) return Promise.reject("No tab to execute the task."); if (!tab) throw "No tab to execute the task.";
if (this._running) return Promise.reject("The task is running. Please wait..."); if (this._running) throw "The task is running. Please wait...";
this._running = true; this._running = true;
let urls = this._urls let urls = this._urls
if (!urls.length) { if (!urls.length) {
if (upstreamData) { if (upstreamData) {
urls = parseUrls(upstreamData); urls = parseUrls(upstreamData);
} else { } else {
urls = [await queryUrl(tab)]; let tabURL: string;
await queryUrl(tab)
.then(u => {
tabURL = u;
})
.catch(() => {
e => {
this._running = false;
return Promise.reject(e);
}
});
urls = [tabURL];
} }
} }
return urls.reduce((p, url, i) => p.then( return urls.reduce((p, url, i) => p.then(
@ -130,13 +146,13 @@ export class Task {
if (results && results.length) { if (results && results.length) {
let lastURL = urls[urls.length - 1]; let lastURL = urls[urls.length - 1];
this.saveResult(results, lastURL); this.saveResult(results, lastURL);
this._running = false;
} }
this._running = false;
} }
).catch( ).catch(
e => { e => {
this._running = false; this._running = false;
return Promise.reject(e); throw e;
} }
); );
} }
@ -148,7 +164,7 @@ export class Task {
return pm; return pm;
} }
private runningCheck(fn: () => Promise<any>): Promise<any> { private runningCheck(fn: () => Promise<any>): Promise<any> {
if (!this._running) return Promise.reject("The task is stopped by user."); if (!this._running) throw "The task is stopped by user.";
return fn(); return fn();
} }
private saveResult(results, key) { private saveResult(results, key) {

View File

@ -1,7 +1,8 @@
import { URL_REG } from "./common";
import { ExtractResult } from "./result"; import { ExtractResult } from "./result";
export function parseUrls(...args) { const URL_REG = /^\s*(https?):\/\//im;
export function parseUrls(...args): string[] {
if (!args.length) return []; if (!args.length) return [];
let arg = args.shift(); let arg = args.shift();
if (arg instanceof Array) { if (arg instanceof Array) {

View File

@ -20,4 +20,9 @@ export interface Request {
url?: string url?: string
fileName?: string fileName?: string
state?: string state?: string
}
export interface Response<T> {
result: T;
error: string;
} }

View File

@ -13,7 +13,7 @@ export class Logger {
constructor(logLevel, notifyLevel) { constructor(logLevel, notifyLevel) {
if (logLevel) this._log_level = logLevel; if (logLevel) this._log_level = logLevel;
if (notifyLevel) this._notify_level = notifyLevel; if (notifyLevel) this._notify_level = notifyLevel;
chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined }); if (chrome.notifications) chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined });
} }
get logLevel() { get logLevel() {
return this._log_level; return this._log_level;
@ -71,3 +71,5 @@ export class Logger {
); );
} }
} }
export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);

View File

@ -1,4 +1,9 @@
export function extract(itemsSelector: string, fieldSelectors: string[]): string[][] { import { logger } from "../common/logger";
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
if (expectedURL && location.href != expectedURL) {
throw 'Target tab URL changed, aborting...';
}
// since some elements may be loaded asynchronously. // since some elements may be loaded asynchronously.
// if one field is never found, we should return undefined, // if one field is never found, we should return undefined,
// so that senders can detect to retry until elements loaded. // so that senders can detect to retry until elements loaded.
@ -11,19 +16,42 @@ export function extract(itemsSelector: string, fieldSelectors: string[]): string
item => { item => {
return fieldSelectors.map( return fieldSelectors.map(
selector => { selector => {
let doClick = false;
if (selector.startsWith("!")) {
doClick = true;
selector = selector.substring(1);
}
let [cls, attr] = selector.split('@').slice(0, 2); let [cls, attr] = selector.split('@').slice(0, 2);
let fieldVals = Array.from(item.querySelectorAll(cls)); let fieldElements: Element[];
if (!fieldVals.length) { cls = cls.trim()
if (cls != "") {
fieldElements = Array.from(item.querySelectorAll(cls));
} else {
fieldElements = [item];
}
if (!fieldElements.length) {
return; return;
} }
fieldFound[selector] = true; fieldFound[selector] = true;
return fieldVals.map(find => attr ? find[attr] : find.textContent.trim()).join('\n') return fieldElements.map(find => {
if (doClick) {
let e = document.createEvent("MouseEvents");
e.initEvent("click", true, true);
find.dispatchEvent(e);
}
return attr ? find[attr] : find.textContent.trim();
}).join('\n')
} }
) )
} }
); );
// TODO: configurable wait logic
// if it exists a field, which is not found in any row, the sender should retry. // if it exists a field, which is not found in any row, the sender should retry.
let shouldWait = fieldSelectors.reduce((p, c) => p || !fieldFound[c], false); let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
let shouldWait = notFoundFields.length > 0;
if (shouldWait) {
logger.debug('should wait for:', fieldSelectors.filter(f => !fieldFound[f]).join(','));
}
return shouldWait ? [] : results; return shouldWait ? [] : results;
} }

View File

@ -1,4 +1,4 @@
import { Request, Actions } from '../common'; import { Request, Actions, Response } from '../common';
import { scrollToBottom, extract } from './actions'; import { scrollToBottom, extract } from './actions';
let asleep = false; let asleep = false;
@ -20,30 +20,56 @@ chrome.runtime.sendMessage(<Request>{
action: Actions.REPORT_NEW_PAGE, action: Actions.REPORT_NEW_PAGE,
}); });
async function doAction(request: Request, sender: chrome.runtime.MessageSender) { async function doAction(request: Request, sender: chrome.runtime.MessageSender): Promise<Response<any>> {
switch (request.action) { let result: any;
case Actions.EXTRACT: let error: string;
let data = extract(request.itemsSelector, request.fieldSelectors); try {
return data; switch (request.action) {
case Actions.GOTO_URL: case Actions.EXTRACT:
window.location.replace(request.url); result = extract(request.itemsSelector, request.fieldSelectors, request.url);
// should not recieve any request until the page & script reload break;
asleep = true; case Actions.GOTO_URL:
return request.url; window.location.replace(request.url);
case Actions.PING: // should not recieve any request until the page & script reload
return "pong"; asleep = true;
case Actions.QUERY_URL: result = request.url;
return window.location.href; break;
case Actions.SCROLL_BOTTOM: case Actions.PING:
return scrollToBottom(); result = "pong";
case Actions.SLEEP: break;
asleep = true; case Actions.QUERY_URL:
return "Content script is sleeping."; result = window.location.href;
case Actions.WAKEUP: break;
asleep = false; case Actions.SCROLL_BOTTOM:
return "Content script is available."; result = scrollToBottom();
default: break;
break; case Actions.SLEEP:
asleep = true;
result = "Content script is sleeping.";
break;
case Actions.WAKEUP:
asleep = false;
result = "Content script is available.";
break;
default:
error = 'Unsupported action.'
break;
}
} catch (err) {
if (err instanceof Error) {
error = err.message;
} else {
error = err;
}
} }
return newResponse(result, error);
} }
/**
 * Build a Response object from a result and an optional error message.
 * @param result value to put in the `result` field
 * @param err error message to put in the `error` field (undefined when omitted)
 * @returns a Response carrying both fields
 */
function newResponse<T>(result: T, err?: string): Response<T> {
    return { result, error: err };
}

View File

@ -10,7 +10,7 @@ window.onload = function () {
document.querySelector('#link-document') document.querySelector('#link-document')
.addEventListener('click', () => { .addEventListener('click', () => {
chrome.tabs.create({ chrome.tabs.create({
'url': `https://git.jebbs.co/jebbs/data-extracter-extesion` 'url': `https://git.qjebbs.com/jebbs/data-extracter-extesion`
}); });
}) })
document.querySelector('#state-input') document.querySelector('#state-input')

View File

@ -52,7 +52,7 @@
<p> <p>
<b>Full document at:</b> <b>Full document at:</b>
<br> <br>
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a> <a href="#" id="link-document">https://git.qjebbs.com/jebbs/data-extracter-extesion</a>
</p> </p>
</div> </div>
</div> </div>