Compare commits

...

6 Commits

Author SHA1 Message Date
d82010686d Extractor.watch() improvements
- only watch current window
- stop watch on window close
- don't ask user to confirm when fails
2020-01-15 18:28:28 +08:00
7644a1363f Extractor.watch() 2020-01-15 17:53:23 +08:00
3338f78d91 code optimize 2020-01-15 15:21:17 +08:00
da7ae057f4 Extractor.stop() 2020-01-15 14:18:31 +08:00
2224db1ad1 incognito window first 2020-01-15 14:05:57 +08:00
790c95ffc3 clean state cache in 30 seconds 2020-01-14 17:03:14 +08:00
10 changed files with 298 additions and 79 deletions

View File

@ -8,15 +8,19 @@ All you need to do is:
- Find out the selectors for target data - Find out the selectors for target data
- Type scripts in the console of `extension backgroud page`, as introduced bellow. - Type scripts in the console of `extension backgroud page`, as introduced bellow.
![](images/console.png) ![](template/assets/console.png)
## Qucik Start ## Qucik Start
Extract current page Extract current page
```js ```js
$('.item', ['a', 'a@href']); $('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start();
``` ```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
Extract multiple pages (1-10, interval 1) Extract multiple pages (1-10, interval 1)
```js ```js
@ -52,10 +56,16 @@ function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## Stop Tasks ## Stop Tasks
The only way to stop tasks before its finish, is `Closing the target tab`. Close the target tab, in which current tasks is running.
> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously. Or use `job.stop()`:
> If you typed wrong selectors, the task waits forever for elements which don't exists.
```js
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
job.stop();
```
## Extract Attributes. ## Extract Attributes.
@ -126,17 +136,17 @@ e.export(1)
Sometimes, it's hard to finish them in an single execution, that why we need "Continuing of Tasks". Sometimes, it's hard to finish them in an single execution, that why we need "Continuing of Tasks".
You can always continue tasks (with following), even it stops in the middle of a task: You can always continue tasks by start it again, not matter in what phase it stops.
```js ```js
e.start() e.start()
``` ```
The `Extractor` kept the state of last execution, and starts from where it stopped. The `Extractor` kept the execution state, and starts from where it stopped.
### Restart Tasks ### Restart Tasks
What should I do, if I don't like to continue from last state, but restart from certain task? What if I don't like to continue from last state, but restart certain tasks?
```js ```js
// restart all tasks // restart all tasks
@ -166,12 +176,15 @@ e.save();
Load the state: Load the state:
Open the popup window, upload the saved state file. Then, and in the backgoud console: Open the popup window, upload the saved state file. Then, and in the backgroud console:
```js ```js
e = new Extractor().load(); e = new Extractor().load();
e.start();
``` ```
> The uploaded state is discarded after 30 seconds if you don't load it.
## Developpment ## Developpment
Clone this project and execute: Clone this project and execute:

View File

@ -1,5 +1,6 @@
import { ACTION_GOTO_URL, ACTION_EXTRACT, ACTION_PING as ACTION_PING, ACTION_QUERY_URL, ACTION_SCROLL_BOTTOM } from "../common"; import { Actions, Request } from "../common";
import { sendMessage } from "./messaging"; import { sendMessage } from "./messaging";
import { logger } from "./common";
/** /**
* redirect tab to url. * redirect tab to url.
@ -10,8 +11,8 @@ import { sendMessage } from "./messaging";
export function redirectTab(tab: chrome.tabs.Tab, url: string) { export function redirectTab(tab: chrome.tabs.Tab, url: string) {
return queryUrl(tab).then(u => { return queryUrl(tab).then(u => {
if (url !== u) { if (url !== u) {
let req = { let req: Request = {
action: ACTION_GOTO_URL, action: Actions.GOTO_URL,
url: url url: url
} }
let checker = async (u, err, tryCount): Promise<string> => { let checker = async (u, err, tryCount): Promise<string> => {
@ -22,7 +23,7 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
} }
if (newURL == url) return url; if (newURL == url) return url;
if ( if (
tryCount % 5 == 0 && tryCount % 1 == 0 &&
!confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.') !confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
) { ) {
return Promise.reject("Tasks stopped by user."); return Promise.reject("Tasks stopped by user.");
@ -41,15 +42,21 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @returns {Promise<string[]>} a promise of extracted data * @returns {Promise<string[]>} a promise of extracted data
*/ */
export function extractTabData(tab, itemsSelector, fieldSelectors) { export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], askOnfail?: boolean) {
let req = { let req = {
action: ACTION_EXTRACT, action: Actions.EXTRACT,
itemsSelector: itemsSelector, itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors fieldSelectors: fieldSelectors
} }
let checker = (result, err, tryCount) => { let checker = (result, err, tryCount) => {
if (!result || !result.length) { if (!result || !result.length) {
if (tryCount % 20 == 0 && confirm('No data found in current page. \n\nContinue to next page?')) { if (
tryCount % 20 == 0 && (
!askOnfail ||
confirm('No data found in current page. \n\nContinue to next page?')
)
) {
logger.warn(`Failed after ${tryCount} tries: ${tab.url}`)
return []; return [];
} else { } else {
return undefined; return undefined;
@ -67,7 +74,7 @@ export function extractTabData(tab, itemsSelector, fieldSelectors) {
*/ */
export async function ping(tab, count = 1) { export async function ping(tab, count = 1) {
let req = { let req = {
action: ACTION_PING action: Actions.PING
} }
let checker = (r: string, e, c) => r == "pong" ? r : undefined; let checker = (r: string, e, c) => r == "pong" ? r : undefined;
let pong = await sendMessage<string>(tab, req, 'Check tab availability...', checker, 1000, count).catch(() => { }); let pong = await sendMessage<string>(tab, req, 'Check tab availability...', checker, 1000, count).catch(() => { });
@ -81,7 +88,7 @@ export async function ping(tab, count = 1) {
*/ */
export function queryUrl(tab: chrome.tabs.Tab) { export function queryUrl(tab: chrome.tabs.Tab) {
let req = { let req = {
action: ACTION_QUERY_URL action: Actions.QUERY_URL
} }
return sendMessage<string>(tab, req); return sendMessage<string>(tab, req);
} }
@ -94,22 +101,79 @@ export function queryUrl(tab: chrome.tabs.Tab) {
*/ */
export function scrollToBottom(tab: chrome.tabs.Tab) { export function scrollToBottom(tab: chrome.tabs.Tab) {
let req = { let req = {
action: ACTION_SCROLL_BOTTOM action: Actions.SCROLL_BOTTOM
} }
return sendMessage(tab, req, 'Scroll to page bottom...'); return sendMessage(tab, req, 'Scroll to page bottom...');
} }
export async function createTab(url: string, active: boolean) { export async function createTab(url: string, active: boolean) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.create({ findIncognitoWindow().then(
'url': url, incognitoWindow => {
'active': active chrome.tabs.create({
}, function (tab) { 'url': url,
resolve(tab); 'active': active,
// createTab to incognito window first
'windowId': incognitoWindow ? incognitoWindow.id : undefined
}, function (tab) {
resolve(tab);
})
}
);
});
}
/**
 * Search the open windows of type 'normal' for an incognito one.
 * @returns a promise resolving to the first incognito window reported by
 * `chrome.windows.getAll`, or `undefined` when none of them is incognito.
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    return new Promise((done) => {
        chrome.windows.getAll(
            { windowTypes: ['normal'] },
            (candidates: chrome.windows.Window[]) => {
                // find() yields undefined when nothing matches, same as the fallback resolve.
                done(candidates.find(candidate => candidate.incognito));
            }
        );
    });
}
/**
 * Promise wrapper around `chrome.windows.getCurrent`.
 * @returns a promise resolving to the window handed to the callback.
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise((done) => {
        chrome.windows.getCurrent(function (current: chrome.windows.Window) {
            done(current);
        });
    });
}
export async function getWindowByID(id: number) {
return new Promise<chrome.windows.Window>((resolve, reject) => {
chrome.windows.get(id, function (window) {
chrome.runtime.lastError;
resolve(window);
}) })
}) })
} }
/**
 * Open a new incognito window through `chrome.windows.create`.
 * @returns a promise resolving to the window passed to the creation callback.
 */
export async function CreateIncognitoWindow() {
    return new Promise((done) => {
        const options = <chrome.windows.CreateData>{ incognito: true };
        chrome.windows.create(options, (created: chrome.windows.Window) => {
            done(created);
        });
    });
}
export async function getActiveTab(currentWindow: boolean) { export async function getActiveTab(currentWindow: boolean) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
chrome.tabs.query({ chrome.tabs.query({

View File

@ -1,8 +1,15 @@
import { logger } from "./common"; import { logger } from "./common";
import { Actions } from "../common";
import { messageSubscribers } from "./messaging";
export class Caches { export class Caches {
private _state: string = ""; private _state: string = "";
constructor() { } constructor() {
messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
sendResponse('recieved!');
this.setState(request.fileName, request.state)
});
}
get state(): string { get state(): string {
let s = this._state; let s = this._state;
this._state = ""; this._state = "";
@ -11,5 +18,12 @@ export class Caches {
setState(name: string, content: string) { setState(name: string, content: string) {
this._state = content; this._state = content;
logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`); logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
// clear cache in 30 seconds
setTimeout(() => {
if (this._state) {
logger.info(`Uploaded state is cleaned after 30 second.`);
this._state = "";
}
}, 30000);
} }
} }

View File

@ -51,14 +51,30 @@ export class Extractor {
/** /**
* Start the task chain. * Start the task chain.
*/ */
async start() { start() {
return this._startTasks(0); return this._startTasks(0);
} }
/**
 * Stop tasks of this extractor.
 * @param id index of the single task to stop; when omitted, every task is stopped.
 */
stop(id?: number) {
    if (id === undefined) {
        // No id given: stop the whole chain.
        for (const task of this._tasks) {
            task.stop();
        }
        return;
    }
    const index = this._checkTaskId(id);
    if (index < 0) return;
    this._tasks[index].stop();
}
/**
 * Put one task into watch mode.
 * @param id index of the task to watch; nothing happens when
 * `_checkTaskId` answers with a negative index.
 */
watch(id: number) {
    const index = this._checkTaskId(id);
    if (index >= 0) {
        this._tasks[index].watch();
    }
}
/** /**
* restart from specified task, but don't restart the previous tasks. * restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0 * @param {number} from where to restart the tasks, begins with 0
*/ */
async restart(from: number = 0) { restart(from: number = 0) {
let id = this._checkTaskId(from, 0); let id = this._checkTaskId(from, 0);
if (id < 0) return; if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) { for (let i = id; i < this._tasks.length; i++) {
@ -133,7 +149,7 @@ ${exResults.toString(50) || "- Empty -"}
saveFile(exResults.toString(), "text/csv"); saveFile(exResults.toString(), "text/csv");
} }
} }
_checkTaskId(id: number, defaultId: number) { _checkTaskId(id: number, defaultId?: number) {
if (!this._tasks.length) { if (!this._tasks.length) {
logger.info("No task found."); logger.info("No task found.");
return -1; return -1;

View File

@ -1,6 +1,6 @@
import { EXT_NAME, ACTION_UPLOAD_STATE } from "../common"; import { Request, Actions } from "../common";
import { getTabByID } from "./actions"; import { getTabByID } from "./actions";
import { caches, logger } from "./common"; import { logger } from "./common";
/** /**
* Sending a message to target tab repeatedly until the response is not undefined. * Sending a message to target tab repeatedly until the response is not undefined.
@ -31,7 +31,7 @@ export function sendMessage<T>(
loop(); loop();
async function loop() { async function loop() {
logger.debug("Request for", req.action); logger.debug("Request for", Actions[req.action]);
let tabAvailable = await getTabByID(tab.id); let tabAvailable = await getTabByID(tab.id);
if (!tabAvailable) { if (!tabAvailable) {
reject("Task interrupted due to the target tab is closed."); reject("Task interrupted due to the target tab is closed.");
@ -78,17 +78,41 @@ export function sendMessage<T>(
}); });
} }
chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) { export type ActionSubscriber = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void | Promise<void>;
if (!request.action || !request.action.startsWith(EXT_NAME)) { class MessageSubscribers {
private listeners: { [key: number]: ActionSubscriber[] } = {};
/**
 * Register a subscriber for an action.
 * @param action the action to subscribe to
 * @param subscriber callback appended to the action's subscriber list
 */
addListener(action: Actions, subscriber: ActionSubscriber) {
    if (!this.listeners[action]) {
        // first subscriber for this action: create its list
        this.listeners[action] = [];
    }
    this.listeners[action].push(subscriber);
}
/**
 * Unregister every occurrence of a subscriber from an action, then log
 * how many subscribers are left for that action.
 * @param action the action the subscriber was registered for
 * @param subscriber the callback to remove
 */
removeListener(action: Actions, subscriber: ActionSubscriber) {
    if (!this.listeners[action]) {
        this.listeners[action] = [];
    }
    const registered = this.listeners[action];
    // Walk backwards so that in-place splicing never skips an element.
    for (let at = registered.length - 1; at >= 0; at--) {
        if (registered[at] == subscriber) {
            registered.splice(at, 1);
        }
    }
    logger.debug(`${registered.length} subscriber(s) remained for action ${Actions[action]}`);
}
/**
 * Look up the subscribers of an action.
 * @param action the action to look up
 * @returns the stored subscriber list itself (not a copy), or `undefined`
 * when no list exists for the action.
 */
getListeners(action: Actions): ActionSubscriber[] {
    const registered = this.listeners[action];
    return registered;
}
}
export const messageSubscribers = new MessageSubscribers();
chrome.runtime.onMessage.addListener(function (request: Request, sender, sendResponse) {
let subscribers = messageSubscribers.getListeners(request.action);
if (!subscribers || !subscribers.length) {
sendResponse("Request not supported.");
return; return;
} }
switch (request.action) { let promises: Promise<any>[] = [];
case ACTION_UPLOAD_STATE: for (let subscriber of subscribers) {
sendResponse('recieved!'); let p = subscriber(request, sender, sendResponse);
caches.setState(request.name, request.state) if (p instanceof Promise) promises.push(p);
break;
default:
sendResponse("Request not supported.");
break;
} }
if (promises.length)
return Promise.all(promises);
return;
}); });

View File

@ -1,7 +1,10 @@
import { parseUrls } from "./tools"; import { parseUrls } from "./tools";
import { queryUrl, redirectTab, scrollToBottom, extractTabData } from "./actions"; import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
import { testArgs, signitures } from "./signiture"; import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result"; import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common";
import { logger } from "./common";
export class Task { export class Task {
private _data: { [key: string]: string[][] } = {}; private _data: { [key: string]: string[][] } = {};
@ -10,6 +13,8 @@ export class Task {
private _itemsSelector: string; private _itemsSelector: string;
private _fieldSelectors: string[]; private _fieldSelectors: string[];
private _urls: string[] = []; private _urls: string[] = [];
private _running = false;
private _listeners: ActionSubscriber[] = [];
constructor(options: any, ...arg: any); constructor(options: any, ...arg: any);
constructor(options: any, itemsSelector: string, fieldSelectors: string[]); constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
@ -45,12 +50,58 @@ export class Task {
return this._fieldSelectors; return this._fieldSelectors;
} }
clean(): Task { clean(): Task {
this.stop();
this._data = {}; this._data = {};
this._data_keys = []; this._data_keys = [];
return this; return this;
} }
/**
 * Mark the task as not running and unsubscribe every listener it
 * registered for `Actions.REPORT_NEW_PAGE`.
 */
stop() {
    this._running = false;
    // Drain the list from the end, unsubscribing each listener as it is popped.
    for (
        let registered = this._listeners.pop();
        registered;
        registered = this._listeners.pop()
    ) {
        messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, registered);
    }
}
/**
 * Watch mode: extract data from every page that reports itself
 * (`Actions.REPORT_NEW_PAGE`) from the watched window, which is the
 * incognito window if one exists, otherwise the current window.
 *
 * Watching ends when `stop()` is called or when the watched window is
 * found to be closed; in both cases the task becomes startable again.
 */
async watch() {
    if (this._running) {
        logger.info("The task is running. Please wait...");
        return;
    }
    // Claim the running flag before the first await, so that a second
    // call made in the meantime is rejected by the check above.
    this._running = true;
    let window = await findIncognitoWindow() || await getCurrentWindow();
    if (!window) {
        logger.info("No window to watch...");
        // Nothing was started: release the flag, or the task stays "running" forever.
        this._running = false;
        return;
    }
    // stop() was called while the window lookup was pending.
    if (!this._running) return;
    let listener: ActionSubscriber = async (request, sender, sendResponse) => {
        let findWindow = await getWindowByID(window.id);
        if (!findWindow) {
            // stop watch on window close.
            messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
            // Only release the task state if this listener is still the
            // registered one (it may already have been dropped by stop()).
            let at = this._listeners.indexOf(listener);
            if (at >= 0) {
                this._listeners.splice(at, 1);
                this._running = false;
            }
            return;
        }
        let tab = sender.tab;
        // only watch current window; ignore senders without a tab.
        if (!tab || tab.windowId != window.id) return;
        try {
            // await tolerates a non-promise result from makeOptionalTasks.
            await this.makeOptionalTasks(tab);
            let results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors, false);
            if (results && results.length) {
                this.saveResult(results, tab.url);
            }
        } catch (e) {
            logger.error(e);
        }
    }
    this._listeners.push(listener);
    messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
}
async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> { async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
if (!tab) return Promise.reject("No tab to execute the task."); if (!tab) return Promise.reject("No tab to execute the task.");
if (this._running) return Promise.reject("The task is running. Please wait...");
this._running = true;
let urls = this._urls let urls = this._urls
if (!urls.length) { if (!urls.length) {
if (upstreamData) { if (upstreamData) {
@ -59,33 +110,53 @@ export class Task {
urls = [await queryUrl(tab)]; urls = [await queryUrl(tab)];
} }
} }
let saveResult = (results, key) => {
this._data[key] = results;
this._data_keys.push(key);
}
return urls.reduce((p, url, i) => p.then( return urls.reduce((p, url, i) => p.then(
results => { results => {
if (i > 0 && results instanceof Array) { if (i > 0 && results instanceof Array) {
let lastURL = urls[i - 1]; let lastURL = urls[i - 1];
saveResult(results, lastURL); this.saveResult(results, lastURL);
} }
if (this._data[url]) return; if (this._data[url]) return;
let pms: Promise<any> = redirectTab(tab, url);
if (this._options["scrollToBottom"]) { let pms: Promise<any> = this.runningCheck(() => redirectTab(tab, url));
pms = pms.then(() => scrollToBottom(tab)); return pms
} .then(() => this.makeOptionalTasks(tab))
return pms.then( .then(
() => extractTabData(tab, this._itemsSelector, this._fieldSelectors) () => this.runningCheck(() => extractTabData(tab, this._itemsSelector, this._fieldSelectors))
); );
} }
), Promise.resolve<string[][]>(null)).then( ), Promise.resolve<string[][]>(null)).then(
results => { results => {
if (results && results.length) { if (results && results.length) {
let lastURL = urls[urls.length - 1]; let lastURL = urls[urls.length - 1];
saveResult(results, lastURL); this.saveResult(results, lastURL);
return; this._running = false;
} }
} }
).catch(
e => {
this._running = false;
return Promise.reject(e);
}
); );
} }
/**
 * Run the optional steps enabled in the task options before extraction.
 * Currently the only such step is scrolling the tab to the bottom
 * (option `scrollToBottom`).
 * @param tab the tab to run the optional steps in
 * @returns a promise that settles when the optional steps are done; it is
 * an already-resolved promise when no option is enabled, so callers can
 * always chain `.then()` on the result.
 */
private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
    if (this._options["scrollToBottom"]) {
        return this.runningCheck(() => scrollToBottom(tab));
    }
    return Promise.resolve();
}
/**
 * Run a step only while the task is marked as running.
 * @param fn factory of the step's promise; not invoked when the task is stopped
 * @returns the promise produced by `fn`, or a rejection when the task is stopped
 */
private runningCheck(fn: () => Promise<any>): Promise<any> {
    return this._running
        ? fn()
        : Promise.reject("The task is stopped by user.");
}
/**
 * Store extracted results under a key and log how many items were found.
 * @param results the extracted items
 * @param key the key to store them under (callers in this class pass the page url)
 */
private saveResult(results, key) {
    const firstTime = this._data[key] === undefined;
    this._data[key] = results;
    if (firstTime) {
        // do not add keys again
        this._data_keys.push(key);
    }
    logger.info(`${results.length} items found.`)
}
} }

View File

@ -1,11 +1,23 @@
/**
 * Identifiers of the messages exchanged between the extension's scripts,
 * grouped by direction.
 *
 * This is a numeric enum: numbering starts at 1 with EXTRACT and each
 * following member is the previous value plus one, so inserting or
 * reordering members changes the numeric values. Being numeric, it also
 * supports the reverse lookup `Actions[value]`, which yields the member name.
 */
export enum Actions {
// from background to content script
EXTRACT = 1,
GOTO_URL,
PING,
QUERY_URL,
SCROLL_BOTTOM,
SLEEP,
WAKEUP,
// from popup to background script
UPLOAD_STATE,
// from content to background script
REPORT_NEW_PAGE,
}
export const EXT_NAME = "DataExtracter"; export interface Request {
action: Actions
export const ACTION_EXTRACT = `${EXT_NAME}:Extract`; itemsSelector?: string
export const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`; fieldSelectors?: string[]
export const ACTION_PING = `${EXT_NAME}:ReportIn`; url?: string
export const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`; fileName?: string
export const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`; state?: string
export const ACTION_UPLOAD_STATE = `${EXT_NAME}:UploadStateFile`; }
export const ACTION_SLEEP = `${EXT_NAME}:Sleep`;
export const ACTION_WAKEUP = `${EXT_NAME}:WakeUp`;

View File

@ -1,11 +1,11 @@
import { ACTION_WAKEUP, ACTION_EXTRACT, ACTION_GOTO_URL, ACTION_PING, ACTION_QUERY_URL, ACTION_SCROLL_BOTTOM, ACTION_SLEEP } from '../common'; import { Request, Actions } from '../common';
import { scrollToBottom, extract } from './actions'; import { scrollToBottom, extract } from './actions';
let asleep = false; let asleep = false;
chrome.runtime.onMessage.addListener( chrome.runtime.onMessage.addListener(
function (request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) { function (request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) {
if (!request.action) return; if (!request.action) return;
if (asleep && ACTION_WAKEUP != request.action) { if (asleep && Actions.WAKEUP != request.action) {
sendResponse && sendResponse(undefined); sendResponse && sendResponse(undefined);
return; return;
} }
@ -16,26 +16,30 @@ chrome.runtime.onMessage.addListener(
} }
); );
async function doAction(request: any, sender: chrome.runtime.MessageSender) { chrome.runtime.sendMessage(<Request>{
action: Actions.REPORT_NEW_PAGE,
});
async function doAction(request: Request, sender: chrome.runtime.MessageSender) {
switch (request.action) { switch (request.action) {
case ACTION_EXTRACT: case Actions.EXTRACT:
let data = extract(request.itemsSelector, request.fieldSelectors); let data = extract(request.itemsSelector, request.fieldSelectors);
return data; return data;
case ACTION_GOTO_URL: case Actions.GOTO_URL:
window.location.replace(request.url); window.location.replace(request.url);
// should not recieve any request until the page & script reload // should not recieve any request until the page & script reload
asleep = true; asleep = true;
return request.url; return request.url;
case ACTION_PING: case Actions.PING:
return "pong"; return "pong";
case ACTION_QUERY_URL: case Actions.QUERY_URL:
return window.location.href; return window.location.href;
case ACTION_SCROLL_BOTTOM: case Actions.SCROLL_BOTTOM:
return scrollToBottom(); return scrollToBottom();
case ACTION_SLEEP: case Actions.SLEEP:
asleep = true; asleep = true;
return "Content script is sleeping."; return "Content script is sleeping.";
case ACTION_WAKEUP: case Actions.WAKEUP:
asleep = false; asleep = false;
return "Content script is available."; return "Content script is available.";
default: default:

View File

@ -1,4 +1,4 @@
import { ACTION_UPLOAD_STATE } from '../common'; import { Request, Actions } from '../common';
window.onload = function () { window.onload = function () {
document.querySelector('#link-extension-detail') document.querySelector('#link-extension-detail')
@ -21,10 +21,10 @@ window.onload = function () {
reader.readAsText(this.files[0], "UTF-8"); reader.readAsText(this.files[0], "UTF-8");
reader.onload = function (evt) { reader.onload = function (evt) {
var fileString = evt.target.result; var fileString = evt.target.result;
chrome.runtime.sendMessage({ chrome.runtime.sendMessage(<Request>{
action: ACTION_UPLOAD_STATE, action: Actions.UPLOAD_STATE,
state: fileString, state: fileString,
name: fileName fileName: fileName
}, r => { }, r => {
if (r) console.log('State sent:', r); if (r) console.log('State sent:', r);
}); });

View File

@ -27,6 +27,7 @@
], ],
"run_at": "document_idle" "run_at": "document_idle"
}], }],
"incognito": "spanning",
"permissions": [ "permissions": [
"activeTab", "activeTab",
"notifications" "notifications"