Compare commits

...

6 Commits

Author SHA1 Message Date
d82010686d Extractor.watch() improvements
- only watch current window
- stop watch on window close
- don't ask user to confirm when fails
2020-01-15 18:28:28 +08:00
7644a1363f Extractor.watch() 2020-01-15 17:53:23 +08:00
3338f78d91 code optimize 2020-01-15 15:21:17 +08:00
da7ae057f4 Extractor.stop() 2020-01-15 14:18:31 +08:00
2224db1ad1 incognito window first 2020-01-15 14:05:57 +08:00
790c95ffc3 clean state cache in 30 seconds 2020-01-14 17:03:14 +08:00
10 changed files with 298 additions and 79 deletions

View File

@ -8,15 +8,19 @@ All you need to do is:
- Find out the selectors for target data
- Type scripts in the console of the `extension background page`, as introduced below.
![](images/console.png)
![](template/assets/console.png)
## Quick Start
Extract current page
```js
$('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start();
```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
Extract multiple pages (1-10, interval 1)
```js
@ -52,10 +56,16 @@ function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## Stop Tasks
The only way to stop tasks before its finish, is `Closing the target tab`.
Close the target tab in which the current tasks are running.
> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
> If you typed wrong selectors, the task waits forever for elements which don't exists.
Or use `job.stop()`:
```js
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
job.stop();
```
## Extract Attributes.
@ -126,17 +136,17 @@ e.export(1)
Sometimes it's hard to finish them in a single execution; that's why we need "Continuing of Tasks".
You can always continue tasks (with following), even it stops in the middle of a task:
You can always continue tasks by starting them again, no matter in what phase they stopped.
```js
e.start()
```
The `Extractor` kept the state of last execution, and starts from where it stopped.
The `Extractor` kept the execution state, and starts from where it stopped.
### Restart Tasks
What should I do, if I don't like to continue from last state, but restart from certain task?
What if I don't like to continue from last state, but restart certain tasks?
```js
// restart all tasks
@ -166,12 +176,15 @@ e.save();
Load the state:
Open the popup window, upload the saved state file. Then, and in the backgoud console:
Open the popup window and upload the saved state file. Then, in the background console:
```js
e = new Extractor().load();
e.start();
```
> The uploaded state will be cleaned in 30 seconds, if you don't load it.
## Development
Clone this project and execute:

View File

@ -1,5 +1,6 @@
import { ACTION_GOTO_URL, ACTION_EXTRACT, ACTION_PING as ACTION_PING, ACTION_QUERY_URL, ACTION_SCROLL_BOTTOM } from "../common";
import { Actions, Request } from "../common";
import { sendMessage } from "./messaging";
import { logger } from "./common";
/**
* redirect tab to url.
@ -10,8 +11,8 @@ import { sendMessage } from "./messaging";
export function redirectTab(tab: chrome.tabs.Tab, url: string) {
return queryUrl(tab).then(u => {
if (url !== u) {
let req = {
action: ACTION_GOTO_URL,
let req: Request = {
action: Actions.GOTO_URL,
url: url
}
let checker = async (u, err, tryCount): Promise<string> => {
@ -22,7 +23,7 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
}
if (newURL == url) return url;
if (
tryCount % 5 == 0 &&
tryCount % 1 == 0 &&
!confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
) {
return Promise.reject("Tasks stopped by user.");
@ -41,15 +42,21 @@ export function redirectTab(tab: chrome.tabs.Tab, url: string) {
* @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
* @returns {Promise<string[]>} a promise of extracted data
*/
export function extractTabData(tab, itemsSelector, fieldSelectors) {
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], askOnfail?: boolean) {
let req = {
action: ACTION_EXTRACT,
action: Actions.EXTRACT,
itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors
}
let checker = (result, err, tryCount) => {
if (!result || !result.length) {
if (tryCount % 20 == 0 && confirm('No data found in current page. \n\nContinue to next page?')) {
if (
tryCount % 20 == 0 && (
!askOnfail ||
confirm('No data found in current page. \n\nContinue to next page?')
)
) {
logger.warn(`Failed after ${tryCount} tries: ${tab.url}`)
return [];
} else {
return undefined;
@ -67,7 +74,7 @@ export function extractTabData(tab, itemsSelector, fieldSelectors) {
*/
export async function ping(tab, count = 1) {
let req = {
action: ACTION_PING
action: Actions.PING
}
let checker = (r: string, e, c) => r == "pong" ? r : undefined;
let pong = await sendMessage<string>(tab, req, 'Check tab availability...', checker, 1000, count).catch(() => { });
@ -81,7 +88,7 @@ export async function ping(tab, count = 1) {
*/
export function queryUrl(tab: chrome.tabs.Tab) {
let req = {
action: ACTION_QUERY_URL
action: Actions.QUERY_URL
}
return sendMessage<string>(tab, req);
}
@ -94,22 +101,79 @@ export function queryUrl(tab: chrome.tabs.Tab) {
*/
export function scrollToBottom(tab: chrome.tabs.Tab) {
let req = {
action: ACTION_SCROLL_BOTTOM
action: Actions.SCROLL_BOTTOM
}
return sendMessage(tab, req, 'Scroll to page bottom...');
}
export async function createTab(url: string, active: boolean) {
return new Promise((resolve, reject) => {
chrome.tabs.create({
'url': url,
'active': active
}, function (tab) {
resolve(tab);
findIncognitoWindow().then(
incognitoWindow => {
chrome.tabs.create({
'url': url,
'active': active,
// createTab to incognito window first
'windowId': incognitoWindow ? incognitoWindow.id : undefined
}, function (tab) {
resolve(tab);
})
}
);
});
}
/**
 * Look for an open incognito window among the windows of type 'normal'.
 * @returns {Promise<chrome.windows.Window>} a promise of the first incognito
 * window reported by `chrome.windows.getAll`, or `undefined` if there is none
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve, reject) => {
        chrome.windows.getAll(
            { windowTypes: ['normal'] },
            (candidates: chrome.windows.Window[]) => {
                // `find` yields undefined when no candidate matches
                resolve(candidates.find(candidate => candidate.incognito));
            }
        );
    });
}
/**
 * Promise wrapper around `chrome.windows.getCurrent`.
 * @returns {Promise<chrome.windows.Window>} a promise of whatever window the
 * callback of `chrome.windows.getCurrent` receives
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve, reject) => {
        chrome.windows.getCurrent((current: chrome.windows.Window) => {
            resolve(current);
        });
    });
}
/**
 * Promise wrapper around `chrome.windows.get`.
 * @param {number} id id of the window to look up
 * @returns {Promise<chrome.windows.Window>} a promise of the window passed to
 * the callback; callers in this file treat a falsy result as "window closed"
 */
export async function getWindowByID(id: number) {
    return new Promise<chrome.windows.Window>((resolve, reject) => {
        chrome.windows.get(id, (found) => {
            // Touch lastError on purpose; presumably this keeps Chrome from
            // reporting an unchecked error when the window no longer exists.
            chrome.runtime.lastError;
            resolve(found);
        });
    });
}
/**
 * Open a new incognito window.
 * @returns a promise resolved with the window handed to the
 * `chrome.windows.create` callback
 */
export async function CreateIncognitoWindow() {
    const createOptions = <chrome.windows.CreateData>{ incognito: true };
    return new Promise((resolve, reject) => {
        chrome.windows.create(createOptions, (created: chrome.windows.Window) => {
            resolve(created);
        });
    });
}
export async function getActiveTab(currentWindow: boolean) {
return new Promise((resolve, reject) => {
chrome.tabs.query({

View File

@ -1,8 +1,15 @@
import { logger } from "./common";
import { Actions } from "../common";
import { messageSubscribers } from "./messaging";
export class Caches {
private _state: string = "";
constructor() { }
constructor() {
messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
sendResponse('recieved!');
this.setState(request.fileName, request.state)
});
}
get state(): string {
let s = this._state;
this._state = "";
@ -11,5 +18,12 @@ export class Caches {
/**
 * Cache an uploaded state and schedule its removal 30 seconds later.
 * @param {string} name name of the uploaded state file (only used in the log)
 * @param {string} content content of the uploaded state
 */
setState(name: string, content: string) {
    this._state = content;
    logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
    // clear cache in 30 seconds
    setTimeout(() => {
        // Only clean the state this timer was armed for. Without the identity
        // check, the timer of an earlier upload would wipe a newer upload
        // before that one has had its own 30 seconds.
        if (this._state && this._state === content) {
            logger.info(`Uploaded state is cleaned after 30 second.`);
            this._state = "";
        }
    }, 30000);
}
}

View File

@ -51,14 +51,30 @@ export class Extractor {
/**
* Start the task chain.
*/
async start() {
start() {
return this._startTasks(0);
}
/**
 * Stop one task, or all tasks when no id is given.
 * @param {number} id optional id of the task to stop
 */
stop(id?: number) {
    if (id === undefined) {
        // no id: stop the whole chain
        for (const task of this._tasks) {
            task.stop();
        }
        return;
    }
    const taskId = this._checkTaskId(id);
    if (taskId < 0) return;
    this._tasks[taskId].stop();
}
/**
 * Put the specified task into watch mode.
 * @param {number} id id of the task to watch with
 */
watch(id: number) {
    const taskId = this._checkTaskId(id);
    if (taskId >= 0) {
        this._tasks[taskId].watch();
    }
}
/**
* restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0
*/
async restart(from: number = 0) {
restart(from: number = 0) {
let id = this._checkTaskId(from, 0);
if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) {
@ -133,7 +149,7 @@ ${exResults.toString(50) || "- Empty -"}
saveFile(exResults.toString(), "text/csv");
}
}
_checkTaskId(id: number, defaultId: number) {
_checkTaskId(id: number, defaultId?: number) {
if (!this._tasks.length) {
logger.info("No task found.");
return -1;

View File

@ -1,6 +1,6 @@
import { EXT_NAME, ACTION_UPLOAD_STATE } from "../common";
import { Request, Actions } from "../common";
import { getTabByID } from "./actions";
import { caches, logger } from "./common";
import { logger } from "./common";
/**
* Sending a message to target tab repeatedly until the response is not undefined.
@ -31,7 +31,7 @@ export function sendMessage<T>(
loop();
async function loop() {
logger.debug("Request for", req.action);
logger.debug("Request for", Actions[req.action]);
let tabAvailable = await getTabByID(tab.id);
if (!tabAvailable) {
reject("Task interrupted due to the target tab is closed.");
@ -78,17 +78,41 @@ export function sendMessage<T>(
});
}
chrome.runtime.onMessage.addListener(function (request, sender, sendResponse) {
if (!request.action || !request.action.startsWith(EXT_NAME)) {
export type ActionSubscriber = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void | Promise<void>;
/**
 * Registry of message subscribers, grouped by the action they listen for.
 */
class MessageSubscribers {
    private listeners: { [key: number]: ActionSubscriber[] } = {};
    /**
     * Register a subscriber for an action.
     * @param {Actions} action the action to listen for
     * @param {ActionSubscriber} subscriber callback invoked for that action
     */
    addListener(action: Actions, subscriber: ActionSubscriber) {
        this.listeners[action] = (this.listeners[action] || []).concat([subscriber]);
    }
    /**
     * Remove every registration of a subscriber for an action.
     * @param {Actions} action the action the subscriber was registered for
     * @param {ActionSubscriber} subscriber the callback to remove
     */
    removeListener(action: Actions, subscriber: ActionSubscriber) {
        // Build a new array instead of splicing in place, so an array handed
        // out earlier (e.g. one being iterated by a dispatcher) is not mutated.
        this.listeners[action] = (this.listeners[action] || []).filter(
            registered => registered !== subscriber
        );
        logger.debug(`${this.listeners[action].length} subscriber(s) remained for action ${Actions[action]}`);
    }
    /**
     * Get the subscribers registered for an action.
     * @param {Actions} action the action to look up
     * @returns {ActionSubscriber[]} a copy of the subscriber list; an empty
     * array (never undefined) when nothing is registered
     */
    getListeners(action: Actions): ActionSubscriber[] {
        return (this.listeners[action] || []).slice();
    }
}
export const messageSubscribers = new MessageSubscribers();
chrome.runtime.onMessage.addListener(function (request: Request, sender, sendResponse) {
let subscribers = messageSubscribers.getListeners(request.action);
if (!subscribers || !subscribers.length) {
sendResponse("Request not supported.");
return;
}
switch (request.action) {
case ACTION_UPLOAD_STATE:
sendResponse('recieved!');
caches.setState(request.name, request.state)
break;
default:
sendResponse("Request not supported.");
break;
let promises: Promise<any>[] = [];
for (let subscriber of subscribers) {
let p = subscriber(request, sender, sendResponse);
if (p instanceof Promise) promises.push(p);
}
if (promises.length)
return Promise.all(promises);
return;
});

View File

@ -1,7 +1,10 @@
import { parseUrls } from "./tools";
import { queryUrl, redirectTab, scrollToBottom, extractTabData } from "./actions";
import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common";
import { logger } from "./common";
export class Task {
private _data: { [key: string]: string[][] } = {};
@ -10,6 +13,8 @@ export class Task {
private _itemsSelector: string;
private _fieldSelectors: string[];
private _urls: string[] = [];
private _running = false;
private _listeners: ActionSubscriber[] = [];
constructor(options: any, ...arg: any);
constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
@ -45,12 +50,58 @@ export class Task {
return this._fieldSelectors;
}
/**
 * Stop the task and drop all the data it has collected.
 * @returns {Task} this task, for chaining
 */
clean(): Task {
    this.stop();
    this._data_keys = [];
    this._data = {};
    return this;
}
/**
 * Mark the task as not running and unsubscribe all of its
 * REPORT_NEW_PAGE listeners.
 */
stop() {
    this._running = false;
    // drain the list from the end, unsubscribing each listener as it is removed
    for (
        let listener: ActionSubscriber = this._listeners.pop();
        listener;
        listener = this._listeners.pop()
    ) {
        messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
    }
}
/**
 * Watch mode: extract data from every page that reports itself
 * (REPORT_NEW_PAGE) from the watched window, instead of driving a tab
 * through a list of urls.
 *
 * The watched window is the incognito window if one is found, otherwise
 * the current window. Watching ends when `stop()` is called or when the
 * watched window can no longer be found.
 */
async watch() {
    if (this._running) {
        logger.info("The task is running. Please wait...");
        return;
    }
    this._running = true;
    let window = await findIncognitoWindow() || await getCurrentWindow();
    // stop() was called while the window was being looked up
    if (!this._running) return;
    if (!window) {
        // release the flag, otherwise the task could never run or watch again
        this._running = false;
        logger.info("No window to watch...");
        return;
    }
    let listener: ActionSubscriber = async (request, sender, sendResponse) => {
        let findWindow = await getWindowByID(window.id);
        if (!findWindow) {
            // stop watch on window close.
            messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
            let index = this._listeners.indexOf(listener);
            if (index >= 0) {
                // Still the active watcher: forget the listener and release
                // the running flag so the task can be started again.
                this._listeners.splice(index, 1);
                this._running = false;
            }
            return;
        }
        // only watch current window (and ignore messages not sent from a tab).
        let tab = sender.tab;
        if (!tab || tab.windowId != window.id) return;
        try {
            // awaiting also covers the case where no optional task is
            // configured and nothing thenable is returned
            await this.makeOptionalTasks(tab);
            let results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors, false);
            if (results && results.length) {
                this.saveResult(results, tab.url);
            }
        } catch (e) {
            logger.error(e);
        }
    }
    this._listeners.push(listener);
    messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
}
async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
if (!tab) return Promise.reject("No tab to execute the task.");
if (this._running) return Promise.reject("The task is running. Please wait...");
this._running = true;
let urls = this._urls
if (!urls.length) {
if (upstreamData) {
@ -59,33 +110,53 @@ export class Task {
urls = [await queryUrl(tab)];
}
}
let saveResult = (results, key) => {
this._data[key] = results;
this._data_keys.push(key);
}
return urls.reduce((p, url, i) => p.then(
results => {
if (i > 0 && results instanceof Array) {
let lastURL = urls[i - 1];
saveResult(results, lastURL);
this.saveResult(results, lastURL);
}
if (this._data[url]) return;
let pms: Promise<any> = redirectTab(tab, url);
if (this._options["scrollToBottom"]) {
pms = pms.then(() => scrollToBottom(tab));
}
return pms.then(
() => extractTabData(tab, this._itemsSelector, this._fieldSelectors)
);
let pms: Promise<any> = this.runningCheck(() => redirectTab(tab, url));
return pms
.then(() => this.makeOptionalTasks(tab))
.then(
() => this.runningCheck(() => extractTabData(tab, this._itemsSelector, this._fieldSelectors))
);
}
), Promise.resolve<string[][]>(null)).then(
results => {
if (results && results.length) {
let lastURL = urls[urls.length - 1];
saveResult(results, lastURL);
return;
this.saveResult(results, lastURL);
this._running = false;
}
}
).catch(
e => {
this._running = false;
return Promise.reject(e);
}
);
}
/**
 * Run the optional per-page steps enabled in the task options
 * (currently only "scrollToBottom").
 * @param {chrome.tabs.Tab} tab the tab to run the steps in
 * @returns {Promise<any>} a promise settled when the optional steps are done;
 * always a promise, so callers can chain `.then()` on it even when no
 * optional step is enabled
 */
private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
    if (this._options["scrollToBottom"]) {
        return this.runningCheck(() => scrollToBottom(tab));
    }
    // nothing to do: resolve immediately instead of returning undefined
    return Promise.resolve();
}
/**
 * Run a step only while the task is still marked as running.
 * @param {() => Promise<any>} fn the step to run
 * @returns {Promise<any>} the promise returned by `fn`, or a rejected
 * promise when the task is no longer running
 */
private runningCheck(fn: () => Promise<any>): Promise<any> {
    return this._running
        ? fn()
        : Promise.reject("The task is stopped by user.");
}
/**
 * Store the results extracted for a key (a page url at the call sites in
 * this class), replacing any results stored for the same key before.
 * @param results the extracted rows
 * @param key the key to store the results under
 */
private saveResult(results, key) {
    // a key is recorded only once, however many times its results are replaced
    const isNewKey = this._data[key] === undefined;
    this._data[key] = results;
    if (isNewKey) {
        this._data_keys.push(key);
    }
    logger.info(`${results.length} items found.`)
}
}

View File

@ -1,11 +1,23 @@
/**
 * Identifiers of the messages exchanged between the background script,
 * the content script and the popup.
 */
export enum Actions {
    // from background to content script
    EXTRACT = 1,
    GOTO_URL = 2,
    PING = 3,
    QUERY_URL = 4,
    SCROLL_BOTTOM = 5,
    SLEEP = 6,
    WAKEUP = 7,
    // from popup to background script
    UPLOAD_STATE = 8,
    // from content to background script
    REPORT_NEW_PAGE = 9,
}
export const EXT_NAME = "DataExtracter";
export const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
export const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
export const ACTION_PING = `${EXT_NAME}:ReportIn`;
export const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
export const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`;
export const ACTION_UPLOAD_STATE = `${EXT_NAME}:UploadStateFile`;
export const ACTION_SLEEP = `${EXT_NAME}:Sleep`;
export const ACTION_WAKEUP = `${EXT_NAME}:WakeUp`;
/**
 * Shape of the messages exchanged between the extension scripts.
 * Only `action` is required; the other fields are read by specific actions.
 */
export interface Request {
// which action the receiver is asked to perform
action: Actions
// selector of the items to extract (sent with Actions.EXTRACT)
itemsSelector?: string
// selectors of the fields under each item (sent with Actions.EXTRACT)
fieldSelectors?: string[]
// url to navigate to (sent with Actions.GOTO_URL)
url?: string
// name of the uploaded state file (sent with Actions.UPLOAD_STATE)
fileName?: string
// content of the uploaded state file (sent with Actions.UPLOAD_STATE)
state?: string
}

View File

@ -1,11 +1,11 @@
import { ACTION_WAKEUP, ACTION_EXTRACT, ACTION_GOTO_URL, ACTION_PING, ACTION_QUERY_URL, ACTION_SCROLL_BOTTOM, ACTION_SLEEP } from '../common';
import { Request, Actions } from '../common';
import { scrollToBottom, extract } from './actions';
let asleep = false;
chrome.runtime.onMessage.addListener(
function (request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) {
if (!request.action) return;
if (asleep && ACTION_WAKEUP != request.action) {
if (asleep && Actions.WAKEUP != request.action) {
sendResponse && sendResponse(undefined);
return;
}
@ -16,26 +16,30 @@ chrome.runtime.onMessage.addListener(
}
);
async function doAction(request: any, sender: chrome.runtime.MessageSender) {
// Runs once when this script is evaluated: send a REPORT_NEW_PAGE message
// through chrome.runtime. No response callback is passed, so any reply is ignored.
chrome.runtime.sendMessage(<Request>{
action: Actions.REPORT_NEW_PAGE,
});
async function doAction(request: Request, sender: chrome.runtime.MessageSender) {
switch (request.action) {
case ACTION_EXTRACT:
case Actions.EXTRACT:
let data = extract(request.itemsSelector, request.fieldSelectors);
return data;
case ACTION_GOTO_URL:
case Actions.GOTO_URL:
window.location.replace(request.url);
// should not recieve any request until the page & script reload
asleep = true;
return request.url;
case ACTION_PING:
case Actions.PING:
return "pong";
case ACTION_QUERY_URL:
case Actions.QUERY_URL:
return window.location.href;
case ACTION_SCROLL_BOTTOM:
case Actions.SCROLL_BOTTOM:
return scrollToBottom();
case ACTION_SLEEP:
case Actions.SLEEP:
asleep = true;
return "Content script is sleeping.";
case ACTION_WAKEUP:
case Actions.WAKEUP:
asleep = false;
return "Content script is available.";
default:

View File

@ -1,4 +1,4 @@
import { ACTION_UPLOAD_STATE } from '../common';
import { Request, Actions } from '../common';
window.onload = function () {
document.querySelector('#link-extension-detail')
@ -21,10 +21,10 @@ window.onload = function () {
reader.readAsText(this.files[0], "UTF-8");
reader.onload = function (evt) {
var fileString = evt.target.result;
chrome.runtime.sendMessage({
action: ACTION_UPLOAD_STATE,
chrome.runtime.sendMessage(<Request>{
action: Actions.UPLOAD_STATE,
state: fileString,
name: fileName
fileName: fileName
}, r => {
if (r) console.log('State sent:', r);
});

View File

@ -27,6 +27,7 @@
],
"run_at": "document_idle"
}],
"incognito": "spanning",
"permissions": [
"activeTab",
"notifications"