Compare commits

...

16 Commits

Author SHA1 Message Date
e87e7010ec improvements
* chance to continue on mismatch url for redirectTab
* support empty field selectors
* add Extractor.results()
* add ExtractResult.walk(), ExtractResult.visit()
* add ! directive to click elements
* code optimize
2021-04-20 14:20:05 +08:00
108ebb835f fix task running state 2021-04-20 12:01:10 +08:00
e0b0a5e986 add timeout for messaging 2021-04-20 12:00:59 +08:00
9cd25e3c1d update url 2021-04-19 15:58:04 +08:00
7827d385bd refactor 2020-06-16 14:45:36 +08:00
ade0670415 update readme 2020-01-17 11:01:13 +08:00
63aec616b1 code optimize 2020-01-17 09:38:40 +08:00
378883b626 check url change before extract data 2020-01-16 15:11:49 +08:00
c78f593c70 code optimize 2020-01-16 09:59:19 +08:00
d82010686d Extractor.watch() improvements
- only watch current window
- stop watch on window close
- don't ask user to confirm when fails
2020-01-15 18:28:28 +08:00
7644a1363f Extractor.watch() 2020-01-15 17:53:23 +08:00
3338f78d91 code optimize 2020-01-15 15:21:17 +08:00
da7ae057f4 Extractor.stop() 2020-01-15 14:18:31 +08:00
2224db1ad1 incognito window first 2020-01-15 14:05:57 +08:00
790c95ffc3 clean state cache in 30 seconds 2020-01-14 17:03:14 +08:00
f06a6f4e78 migrate to typescript, with fixes 2020-01-14 16:37:50 +08:00
35 changed files with 5792 additions and 666 deletions

154
.gitignore vendored
View File

@ -1,2 +1,154 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.gitignore.io/api/visualstudiocode,macos,node
# Edit at https://www.gitignore.io/?templates=visualstudiocode,macos,node
### macOS ###
# General
.DS_Store .DS_Store
Thumbs.db .AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# rollup.js default build output
dist/
# Uncomment the public line if your project uses Gatsby
# https://nextjs.org/blog/next-9-1#public-directory-support
# https://create-react-app.dev/docs/using-the-public-folder/#docsNav
# public
# Storybook build outputs
.out
.storybook-out
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Temporary folders
tmp/
temp/
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
# End of https://www.gitignore.io/api/visualstudiocode,macos,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

View File

@ -1,45 +0,0 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.5.0",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
"16": "icon.png",
"48": "icon.png",
"128": "icon.png"
},
"browser_action": {
"default_icon": "icon.png",
"default_popup": "popup/tip.html",
"default_title": "Data Extracter"
},
"background": {
"scripts": [
"scripts/shared/tools.js",
"scripts/shared/common.js",
"scripts/background/logger.js",
"scripts/background/messaging.js",
"scripts/background/result.js",
"scripts/background/signiture.js",
"scripts/background/actions.js",
"scripts/background/task.js",
"scripts/background/extractor.js",
"scripts/background/helpers.js"
],
"persistent": false
},
"content_scripts": [{
"matches": ["*://*/*"],
"js": [
"scripts/shared/tools.js",
"scripts/shared/common.js",
"scripts/content/content.js"
],
"run_at": "document_idle"
}],
"permissions": [
"activeTab",
"notifications"
]
}

4433
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

17
package.json Normal file
View File

@ -0,0 +1,17 @@
{
"name": "data-extractor",
"scripts": {
"dev": "webpack --mode=development --devtool=inline-source-map --watch",
"prod": "webpack --mode=production"
},
"devDependencies": {
"@types/chrome": "0.0.91",
"@types/node": "^13.1.6",
"copy-webpack-plugin": "^5.1.1",
"ts-loader": "^6.2.1",
"tslint": "^5.20.1",
"typescript": "^3.7.4",
"webpack": "^4.41.5",
"webpack-cli": "^3.3.10"
}
}

View File

@ -8,15 +8,21 @@ All you need to do is:
- Find out the selectors for target data - Find out the selectors for target data
- Type scripts in the console of `extension backgroud page`, as introduced bellow. - Type scripts in the console of `extension backgroud page`, as introduced bellow.
![](images/console.png) ![](template/assets/console.png)
## Qucik Start ## Qucik Start
Extract current page Extract current page
```js ```js
$('.item', ['a', 'a@href']); $('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start();
// fieldSelectors can be empty strings if items have no child to select
new Extractor().task('.item a', ['', '@href']).start();
``` ```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
Extract multiple pages (1-10, interval 1) Extract multiple pages (1-10, interval 1)
```js ```js
@ -52,12 +58,20 @@ function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## Stop Tasks ## Stop Tasks
The only way to stop tasks before its finish, is `Closing the target tab`. Close the target tab, in which current tasks is running.
> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously. Or use `job.stop()`:
> If you typed wrong selectors, the task waits forever for elements which don't exists.
## Extract Attributes. ```js
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
job.stop();
```
> Next time you call `job.start();`, the job will continue from where it stopped.
## Extract Attributes
e.g.: link text and target (use 'selector@attribute') e.g.: link text and target (use 'selector@attribute')
@ -65,6 +79,14 @@ e.g.: link text and target (use 'selector@attribute')
new Extractor().task('.item', ['a', 'a@href']).start(); new Extractor().task('.item', ['a', 'a@href']).start();
``` ```
## Click Selected Elements
The following clicks selected links and extracts link `text` and `href`
```js
new Extractor().task('.item', ['!a', 'a@href']).start();
```
## Advanced Usage ## Advanced Usage
### Use Task Chain. ### Use Task Chain.
@ -126,17 +148,17 @@ e.export(1)
Sometimes, it's hard to finish them in an single execution, that why we need "Continuing of Tasks". Sometimes, it's hard to finish them in an single execution, that why we need "Continuing of Tasks".
You can always continue tasks (with following), even it stops in the middle of a task: You can always continue tasks by start it again, not matter in what phase it stops.
```js ```js
e.start() e.start()
``` ```
The `Extractor` kept the state of last execution, and starts from where it stopped. The `Extractor` kept the execution state, and starts from where it stopped.
### Restart Tasks ### Restart Tasks
What should I do, if I don't like to continue from last state, but restart from certain task? What if I don't like to continue from last state, but restart certain tasks?
```js ```js
// restart all tasks // restart all tasks
@ -166,8 +188,59 @@ e.save();
Load the state: Load the state:
Open the popup window, upload the saved state file. Then, and in the backgoud console: Open the popup window, upload the saved state file. Then, and in the backgroud console:
```js ```js
e = new Extractor().load(); e = new Extractor().load();
e.start();
```
> The uploaded state will be cleaned in 30 seconds, if you don't load it.
## Watch Mode
Watch mode tries to extract data from every page you visit **in current window**.
```js
e = new Extractor();
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]);
e.watch(1); // start watching for first task
```
To stop watching, you can either `close current window`, or:
```js
e.stop();
```
## Results Operation
To get the results of a task:
```js
let results = job.results(0);
```
Visit URLs (if any) in the results one by one:
```js
results.visit();
```
Walk through all results one by one:
```js
results.walk((row,col,value)=>{console.log(value)});
```
## Development
Clone this project and execute:
```sh
npm i
npm run prod
# or
npm run dev
``` ```

View File

@ -1,147 +0,0 @@
/**
 * Normalize the url arguments of a task into a flat list of urls.
 *
 * Accepted call forms:
 *   parseUrls()                                   -> []
 *   parseUrls(urls: string[])                     -> urls, returned as is
 *   parseUrls(result: ExtractResult)              -> every squashed cell that matches URL_REG
 *   parseUrls(template: string, pages: any[])     -> template with "${page}" filled by each page
 *   parseUrls(template: string, from, to, interval)
 *                                                 -> template filled for from, from+interval, ... (<= to)
 * Any other combination yields an empty list.
 *
 * @param {...any} args see the call forms above
 * @returns {string[]} list of urls
 * @throws {Error} when a non-empty range is requested with an interval <= 0,
 *                 which would otherwise loop forever.
 */
function parseUrls(...args) {
    if (!args.length) return [];
    const [first, ...rest] = args;
    if (first instanceof Array) return first;
    if (first instanceof ExtractResult) {
        return first.squash().filter(v => URL_REG.test(v));
    }
    const urlTempl = first;
    if (!urlTempl) return [];
    // split/join fills EVERY "${page}" placeholder; String.replace with a
    // string pattern would only fill the first one.
    const fill = page => urlTempl.split("${page}").join(String(page));
    if (rest[0] instanceof Array) {
        return rest[0].map(fill);
    }
    if (rest.length >= 3) {
        const [from, to, interval] = rest;
        if (from <= to && interval <= 0) {
            throw new Error(`Invalid page interval "${interval}": it must be greater than 0.`);
        }
        const urls = [];
        for (let i = from; i <= to; i += interval) {
            urls.push(fill(i));
        }
        return urls;
    }
    return [];
}
/**
 * Navigate the tab to the url, unless the tab already reports that url.
 *
 * After each GOTO_URL request the tab is asked for its current url again;
 * the navigation counts as done once that url equals the target url.
 * Every 5th failed try the user is asked whether to keep trying.
 *
 * @param {any} tab target tab
 * @param {string} url target URL
 * @returns {Promise<string|undefined>} a promise of the target URL, or of
 *          undefined when the tab was already at the target URL
 */
function redirectTab(tab, url) {
    return queryUrl(tab).then(currentURL => {
        if (url === currentURL) return undefined;
        const req = {
            action: ACTION_GOTO_URL,
            url: url
        };
        // The first checker argument is the message response. It must not be
        // named `url`: that shadowed the target url, so the comparison below
        // was made against the response instead of the target.
        const checker = async (response, err, tryCount) => {
            const newURL = await queryUrl(tab).catch(() => { });
            if (newURL == url) return url;
            if (
                tryCount % 5 == 0 &&
                !confirm('Cannot navigate to target url. \nPress OK to continue, Cancel to stop.')
            ) {
                return MSG_USER_ABORT;
            }
            return undefined;
        };
        return sendMessage(tab, req, `Goto url: ${url}`, checker);
    });
}
/**
 * Extract data from the target tab.
 *
 * While the tab answers MSG_ELEMENT_NOT_FOUND the request is retried. On every
 * 20th try the user is asked whether to give up on this page:
 * OK resolves with an empty list, Cancel keeps waiting.
 *
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @returns {Promise<string[][]>} a promise of extracted data (rows of fields)
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
    const req = {
        action: ACTION_EXTRACT,
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors
    };
    const checker = (result, err, tryCount) => {
        if (!MSG_ELEMENT_NOT_FOUND.isEqual(result)) return result;
        if (tryCount % 20 != 0) return undefined;
        if (confirm('No data found in current page. \n\nContinue to next page?')) {
            return [];
        }
        // Cancel: keep waiting. Falling through to `return result` here
        // resolved the task with the "not found" marker as if it were data.
        return undefined;
    };
    return sendMessage(tab, req, 'Extract data from the tab...', checker);
}
/**
 * Ping the target tab; usually used to detect if the content script is ready.
 * The tab is expected to echo the request action back. Any failure of the
 * underlying sendMessage (e.g. retry limit reached) counts as "no pong".
 * @param {any} tab target tab
 * @param {number} count how many times to try, default 1
 * @returns {Promise<boolean>} a promise of boolean value indicates if ping success
 */
async function ping(tab, count = 1) {
    const request = { action: ACTION_REPORT_IN };
    const expectEcho = reply => {
        if (reply == request.action) return request.action;
        return undefined;
    };
    const pending = sendMessage(tab, request, 'Check tab availability...', expectEcho, 1000, count);
    const answer = await pending.catch(() => { });
    return answer == ACTION_REPORT_IN;
}
/**
 * Ask the target tab for its current url.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab) {
    return sendMessage(tab, { action: ACTION_QUERY_URL });
}
/**
 * Ask the target tab to scroll to the bottom of its page.
 * @param {any} tab target tab
 * @returns {Promise<any>} a promise of the tab's response to the scroll request
 */
function scrollToBottom(tab) {
    return sendMessage(tab, { action: ACTION_SCROLL_BOTTOM }, 'Scroll to page bottom...');
}
/**
 * Promise wrapper around chrome.tabs.create.
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab becomes the active tab
 * @returns {Promise<any>} a promise of the created tab
 */
async function createTab(url, active) {
    const createProperties = { 'url': url, 'active': active };
    return new Promise(resolve => {
        chrome.tabs.create(createProperties, tab => resolve(tab));
    });
}
/**
 * Promise wrapper around chrome.tabs.query for the active tab.
 * @param {boolean} currentWindow passed through as the `currentWindow` query filter
 * @returns {Promise<any>} a promise of the first matching tab (undefined if none)
 */
async function getActiveTab(currentWindow) {
    const queryInfo = { active: true, currentWindow: currentWindow };
    return new Promise(resolve => {
        chrome.tabs.query(queryInfo, tabs => resolve(tabs[0]));
    });
}
/**
 * Promise wrapper around chrome.tabs.get. Never rejects: the promise resolves
 * with whatever the callback receives (callers treat a falsy value as
 * "tab not available").
 * @param {number} id tab id
 * @returns {Promise<any>} a promise of the tab
 */
async function getTabByID(id) {
    return new Promise(resolve => {
        const onTab = tab => {
            // Touch lastError so a failed lookup is not reported as an
            // unchecked error; the value itself is intentionally ignored.
            chrome.runtime.lastError;
            resolve(tab);
        };
        chrome.tabs.get(id, onTab);
    });
}

View File

@ -1,3 +0,0 @@
/**
 * Short form of `new Extractor().task(...args).start()`.
 * @param {...any} args arguments forwarded to Extractor.task()
 * @returns whatever Extractor.start() returns
 */
function $(...args) {
    const extractor = new Extractor();
    const withTask = extractor.task(...args);
    return withTask.start();
}

View File

@ -1,76 +0,0 @@
/**
 * Sending a message to target tab repeatedly until the response is not undefined / null.
 *
 * Rejects (with a string) when the target tab no longer exists, when the retry
 * limit is reached, or when the checker returns MSG_USER_ABORT. Rejects with the
 * thrown value when the checker itself throws.
 *
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log messages logged to console.
 * @param {function} dataChecker (result:any, err:error, tryCount:number) => any.
 * Check and decide what value finally returns.
 * Return undefined to make 'sendMessage' retry.
 * Return MSG_USER_ABORT to cancel this promise.
 * @param {number} interval retry interval, default: 500ms.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the response.
 */
function sendMessage(tab, req, log, dataChecker, interval, limit = 0) {
    const delay = interval || 500;
    const maxTries = limit && !isNaN(limit) ? limit : 0;
    let count = 0;
    return new Promise((resolve, reject) => {
        loop();

        async function loop() {
            logger.debug("Request for", req.action);
            const tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            if (maxTries && count >= maxTries) {
                reject(`sendMessage loop limit ${maxTries} reached.`);
                return;
            }
            count++;
            chrome.tabs.sendMessage(tab.id, req, async r => {
                // check error but do nothing.
                // do not interrupt promise chains even if error, or the task always fail when:
                // a tab is newly created, and the content scripts won't have time to initialize
                const err = chrome.runtime.lastError;
                let result = r;
                if (dataChecker) {
                    try {
                        result = await dataChecker(r, err, count);
                    } catch (e) {
                        // a failing checker must settle the promise instead of leaving it pending
                        reject(e);
                        return;
                    }
                    if (MSG_USER_ABORT.isEqual(result)) {
                        reject(MSG_USER_ABORT.message);
                        // stop here: without this the abort was still logged as "(OK)"
                        return;
                    }
                }
                const flag = result !== undefined && result !== null;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(result);
                } else {
                    setTimeout(loop, delay);
                }
            });
        }
    });
}
// Background-side message handler. Only requests whose action starts with
// EXT_NAME are handled; the single supported action stores an uploaded state
// in __EXTRACTOR_STATE__ so that it can be loaded later.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    const action = request.action;
    if (!action || !action.startsWith(EXT_NAME)) {
        return;
    }
    if (action === ACTION_UPLOAD_STATE) {
        sendResponse('recieved!');
        __EXTRACTOR_STATE__ = request.state;
        logger.info(`State (${request.name}) recieved. To load it: some_var = new Extractor().load()`);
        return;
    }
    sendResponse("Request not supported.");
});

View File

@ -1,37 +0,0 @@
/**
 * Holds extracted data as a table: an array of rows, each row an array of cells.
 */
class ExtractResult {
    /**
     * @param {Array<Array<any>>} data rows of cells; defaults to an empty table
     */
    constructor(data) {
        this._data = data || [];
    }

    /** @returns the row at `index` (undefined when out of range) */
    row(index) {
        return this._data[index];
    }

    /**
     * @returns {Array<any>} the cell at `index` of every row; undefined for
     * rows that are missing or too short
     */
    column(index) {
        return Array.from(this._data, row => (row === undefined || row === null ? undefined : row[index]));
    }

    /** @returns {Array<any>} all cells of all rows in one flat list */
    squash() {
        return this._data.reduce((p, c) => p.concat(c), []);
    }

    get data() {
        return this._data;
    }

    /**
     * Render the table as CSV: every cell is trimmed, double-quoted and has its
     * quotes doubled; every row ends with "\n"; an empty row is an empty line.
     * Missing cells (null / undefined) become empty strings and non-string
     * cells are converted with String(), so one absent field no longer makes
     * the whole export throw.
     * @param {number} rowsCount if > 0, only the first `rowsCount` rows are rendered
     * @returns {string} the CSV text
     */
    toString(rowsCount) {
        const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const quote = cell => {
            const text = cell === undefined || cell === null ? "" : String(cell);
            return '"' + text.trim().replace(/"/g, '""') + '"';
        };
        return Array.from(rows, cells => {
            if (!cells || !cells.length) return "\n";
            return Array.from(cells, quote).join(",") + "\n";
        }).join("");
    }
}

View File

@ -1,88 +0,0 @@
/**
 * One extraction task: a set of selectors plus the urls to run them on.
 * Results are cached per url, so a task that is executed again only visits
 * the urls it has no result for yet.
 */
class Task {
    // url -> extracted rows of that url
    _data = {};
    // urls in the order their results were saved
    _data_keys = [];

    /**
     * Create a task.
     * constructor(itemsSelector:string, fieldSelectors:string[])
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
     * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
     * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[])
     * @param {object} options extractor options; only "scrollToBottom" is read here
     * @param {...any} args
     * @throws {Error} when testArgs rejects the arguments
     */
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        this._options = options;
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }

    /**
     * Overwrite selectors, urls and cached results with those of a saved state.
     * @param {object} state a previously saved task
     * @returns {Task} this
     */
    load(state) {
        this._data = state._data;
        this._data_keys = state._data_keys;
        this._itemsSelector = state._itemsSelector;
        this._fieldSelectors = state._fieldSelectors;
        this._urls = state._urls;
        return this;
    }

    get urls() {
        return this._urls;
    }

    get data() {
        return this._data;
    }

    /** All saved rows, concatenated in the order their urls were saved. */
    get results() {
        return this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
    }

    get fieldSelectors() {
        return this._fieldSelectors;
    }

    /** Drop all cached results. */
    clean() {
        this._data = {};
        this._data_keys = [];
    }

    /**
     * Run the task in the tab, one url after the other.
     * Urls come from the task itself; if it has none, from the upstream data;
     * if there is no upstream data either, the tab's current url is used.
     * Urls that already have a cached result are skipped. Each result is saved
     * as soon as it is extracted, so a failure keeps what was collected so far.
     * @param {any} tab target tab
     * @param {any} upstreamData result of the previous task, passed to parseUrls
     * @returns {Promise<void>} resolves when every url has been handled
     */
    async execute(tab, upstreamData) {
        if (!tab) return Promise.reject("No tab to execute the task.");
        let urls = this._urls;
        if (!urls.length) {
            urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
        }
        // A plain loop does nothing for an empty url list; the former promise
        // chain saved a null result under the key `undefined` in that case.
        for (const url of urls) {
            if (this._data[url]) continue;
            await redirectTab(tab, url);
            // options may be undefined when the extractor was created without any
            if (this._options && this._options["scrollToBottom"]) {
                await scrollToBottom(tab);
            }
            const results = await extractTabData(tab, this._itemsSelector, this._fieldSelectors);
            this._data[url] = results;
            this._data_keys.push(url);
        }
    }
}

View File

@ -1,114 +0,0 @@
(function () {
// While asleep, every request except ACTION_WAKEUP is answered with undefined.
let asleep = false;
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
    // sendResponse is used defensively: it is only called when it is set
    const reply = value => {
        if (sendResponse) sendResponse(value);
    };
    if (!request.action) return undefined;
    if (asleep && ACTION_WAKEUP != request.action) {
        reply(undefined);
        return undefined;
    }
    // console.log("Recieved request:",request);
    doAction(request, sender).then(reply);
    // return true to indicate you wish to send a response asynchronously
    return true;
});
/**
 * Perform the action named by `request.action` and return its response.
 * Unknown actions produce undefined.
 * @param {object} request the request; `action` selects what to do
 * @param {any} sender the message sender (unused)
 * @returns {Promise<any>} a promise of the response for the action
 */
async function doAction(request, sender) {
    const action = request.action;
    if (action === ACTION_EXTRACT) {
        return extract(request.itemsSelector, request.fieldSelectors);
    }
    if (action === ACTION_GOTO_URL) {
        window.location.replace(request.url);
        // should not recieve any request until the page & script reload
        asleep = true;
        return request.url;
    }
    if (action === ACTION_REPORT_IN) {
        return request.action;
    }
    if (action === ACTION_QUERY_URL) {
        return window.location.href;
    }
    if (action === ACTION_SCROLL_BOTTOM) {
        const scroll = () => window.scrollTo(0, document.body.clientHeight);
        const atBottom = () => document.body.clientHeight - window.scrollY - window.innerHeight < 20;
        return executeUntil(scroll, atBottom, "Scroll to page bottom...", 1000, 10);
    }
    if (action === ACTION_SLEEP) {
        asleep = true;
        return "Content script is sleeping.";
    }
    if (action === ACTION_WAKEUP) {
        asleep = false;
        return "Content script is available.";
    }
    return undefined;
}
/**
 * Extract one row per element matching `itemsSelector`, with one cell per
 * field selector.
 *
 * A field selector is "cssSelector" or "cssSelector@attribute": without an
 * attribute the trimmed text content is taken, otherwise that property of the
 * element. Several matches inside one item are joined with "\n"; a field
 * without a match in an item yields undefined for that cell.
 *
 * Since elements may be loaded asynchronously, MSG_ELEMENT_NOT_FOUND is
 * returned (so that the sender can retry) when no item matches, or when some
 * field selector matched in no item at all. With wrong selectors the sender
 * therefore retries for as long as it likes.
 *
 * @param {string} itemsSelector selector of the items (rows)
 * @param {Array<string>} fieldSelectors selectors of the fields (columns)
 * @returns {Array<Array<string>>|object} the rows, or MSG_ELEMENT_NOT_FOUND
 */
function extract(itemsSelector, fieldSelectors) {
    const items = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!items.length) return MSG_ELEMENT_NOT_FOUND;

    const seenFields = {};
    const rows = [];
    for (const item of items) {
        const row = [];
        for (const selector of fieldSelectors) {
            const [cls, attr] = selector.split('@').slice(0, 2);
            const matches = Array.from(item.querySelectorAll(cls));
            if (!matches.length) {
                row.push(undefined);
                continue;
            }
            seenFields[selector] = true;
            const values = matches.map(el => (attr ? el[attr] : el.textContent.trim()));
            row.push(values.join('\n'));
        }
        rows.push(row);
    }
    // if it exists a field, which is not found in any row, the sender should retry.
    const someFieldMissing = fieldSelectors.some(selector => !seenFields[selector]);
    if (someFieldMissing) return MSG_ELEMENT_NOT_FOUND;
    return rows;
}
/**
 * Repeatedly execute a function until the detector returns true, or until the
 * function has been executed `limit` times.
 *
 * Each round executes `fn`, waits `interval` ms and then asks the detector.
 * The promise resolves with true as soon as the detector is satisfied (or
 * right after the first round when no detector is given), and with false when
 * the limit is used up first. It rejects only if `fn` or the detector throws.
 *
 * Previously the limit was never effective (the limit itself was incremented
 * instead of the counter), so an unsatisfied detector looped forever.
 *
 * @param {function} fn the function to execute
 * @param {function} detector the detector.
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of a function; falsy means no limit
 * @return {Promise<boolean>} a promise of whether the detector was satisfied
 */
function executeUntil(fn, detector, log, interval, limit) {
    const delay = interval || 500;
    let count = 0;
    return new Promise((resolve, reject) => {
        const round = () => {
            try {
                fn();
            } catch (e) {
                reject(e);
                return;
            }
            count++;
            setTimeout(() => {
                let flag;
                try {
                    flag = !detector || detector();
                } catch (e) {
                    reject(e);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else if (limit && count >= limit) {
                    resolve(false);
                } else {
                    round();
                }
            }, delay);
        };
        round();
    });
}
})();

View File

@ -1,10 +0,0 @@
// Name of the extension; it prefixes every action id below, so a request can
// be recognised as belonging to this extension by `action.startsWith(EXT_NAME)`.
const EXT_NAME = "DataExtracter";
// Action ids, carried in the `action` field of a request.
// Extract data from the page with the given selectors.
const ACTION_EXTRACT = `${EXT_NAME}:Extract`;
// Navigate the page to a url.
// NOTE(review): "GoToTUL" looks like a typo of "GoToURL". It is harmless as
// long as every sender and receiver uses this constant rather than the literal.
const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`;
// Ping: the receiver echoes the action back.
const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`;
// Ask for the current url of the page.
const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`;
// Scroll the page to its bottom.
const ACTION_SCROLL_BOTTOM = `${EXT_NAME}:ScrollToBottom`;
// Hand an uploaded state file over to the background page.
const ACTION_UPLOAD_STATE = `${EXT_NAME}:UploadStateFile`;
// Make the content script ignore requests / accept them again.
const ACTION_SLEEP = `${EXT_NAME}:Sleep`;
const ACTION_WAKEUP = `${EXT_NAME}:WakeUp`;

File diff suppressed because one or more lines are too long

202
src/background/actions.ts Normal file
View File

@ -0,0 +1,202 @@
import { Actions, Request } from "../common";
import { sendMessage, ResponseChecker } from "./messaging";
import { logger } from "../common/logger";
/**
 * Redirect the tab to the url, unless the tab already reports that url.
 *
 * With `check` set, the tab url is queried again after each GOTO_URL request:
 * a match resolves with the target url; on a mismatch the user decides whether
 * to continue with the actual url (OK) or to retry (Cancel). A failure of that
 * url query is thrown from the checker.
 *
 * @param {any} tab target tab
 * @param {string} url target URL
 * @param {boolean} check whether to verify the tab url after the request
 * @returns {Promise<string>} a promise of the url the tab ended up at
 *          (undefined when no redirect was needed)
 */
export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
    const verifyNavigation: ResponseChecker<string> = async (r, err, tryCount): Promise<string> => {
        let queryErr: any;
        const newURL = await queryUrl(tab).catch(e => queryErr = e);
        if (queryErr) {
            throw queryErr;
        }
        if (newURL == url) return url;
        const goOn = confirm(`Cannot navigate to target url.
expected: ${url}\n
actual: ${newURL}\n
Press OK to continue, Cancel to retry. Close the tab to stop`);
        return goOn ? newURL : undefined;
    };
    return queryUrl(tab).then(current => {
        if (url === current) return undefined;
        const req: Request = {
            action: Actions.GOTO_URL,
            url: url
        };
        return sendMessage<string>(tab, req, `Goto url: ${url}`, check ? verifyNavigation : undefined);
    });
}
/**
 * Extract data from the target tab.
 *
 * An error reported in the response is thrown. An empty result is retried;
 * on every 20th try the extraction gives up with an empty list - right away
 * when `askOnfail` is not set, otherwise only if the user confirms.
 *
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @param {string} expectedURL sent along as the `url` of the request
 * @param {boolean} askOnfail ask the user before giving up on a page
 * @returns {Promise<string[][]>} a promise of extracted data
 */
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], expectedURL?: string, askOnfail?: boolean) {
    const req: Request = {
        action: Actions.EXTRACT,
        itemsSelector: itemsSelector,
        fieldSelectors: fieldSelectors,
        url: expectedURL,
    };
    const checker: ResponseChecker<string[][]> = (response, err, tryCount) => {
        if (response.error) throw response.error;
        const rows = response.result;
        if (rows && rows.length) return rows;
        const giveUp = tryCount % 20 == 0 && (
            !askOnfail ||
            confirm('No data found in current page. \n\nContinue to next page?')
        );
        if (!giveUp) return undefined;
        logger.warn(`Failed after ${tryCount} tries: ${tab.url}`);
        return [];
    };
    return sendMessage<string[][]>(tab, req, 'Extract data from the tab...', checker);
}
/**
 * Ping the target tab; usually used to detect if the content script is ready.
 * The ping succeeds when the response result is "pong"; a failing sendMessage
 * counts as an unsuccessful ping.
 * @param {any} tab target tab
 * @param {number} count passed to sendMessage as its last argument, default 1
 * @returns {Promise<boolean>} a promise of boolean value indicates if ping success
 */
export async function ping(tab, count = 1) {
    const req = {
        action: Actions.PING
    };
    const isPong: ResponseChecker<string> = (r, e, c) => {
        if (r.result == "pong") return r.result;
        return undefined;
    };
    const pending = sendMessage<string>(tab, req, 'Check tab availability...', isPong, 1000, 1000, count);
    const reply = await pending.catch(() => { });
    return reply == "pong";
}
/**
 * Ask the target tab for its url.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the url
 */
export function queryUrl(tab: chrome.tabs.Tab) {
    const request = { action: Actions.QUERY_URL };
    return sendMessage<string>(tab, request);
}
/**
 * Send a SCROLL_BOTTOM request to the target tab.
 * @param {any} tab target tab
 * @returns a promise of the tab's response to the request
 */
export function scrollToBottom(tab: chrome.tabs.Tab) {
    const request = { action: Actions.SCROLL_BOTTOM };
    return sendMessage(tab, request, 'Scroll to page bottom...');
}
/**
 * Create a tab, preferably in an incognito window: when findIncognitoWindow
 * finds one, the tab is created there, otherwise no window is specified.
 *
 * The lookup is awaited directly instead of being wrapped in `new Promise`,
 * so a failure of the lookup rejects this promise instead of leaving it
 * pending forever.
 *
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab becomes the active tab
 * @returns {Promise<chrome.tabs.Tab>} a promise of the created tab
 */
export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
    const incognitoWindow = await findIncognitoWindow();
    return new Promise((resolve) => {
        chrome.tabs.create({
            'url': url,
            'active': active,
            // createTab to incognito window first
            'windowId': incognitoWindow ? incognitoWindow.id : undefined
        }, function (tab) {
            resolve(tab);
        });
    });
}
/**
 * Look for an incognito window among the windows of type 'normal'.
 * @returns {Promise<chrome.windows.Window>} a promise of the first incognito
 *          window found, or of undefined when there is none
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    const filter = { windowTypes: ['normal'] };
    return new Promise((resolve) => {
        chrome.windows.getAll(filter, (windows: chrome.windows.Window[]) => {
            resolve(windows.find(candidate => candidate.incognito));
        });
    });
}
/**
 * Promise wrapper around chrome.windows.getCurrent.
 * @returns {Promise<chrome.windows.Window>} a promise of the current window
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise((resolve) => {
        chrome.windows.getCurrent((current: chrome.windows.Window) => {
            resolve(current);
        });
    });
}
/**
 * Promise wrapper around chrome.windows.get. Never rejects: it resolves with
 * whatever the callback receives.
 * @param {number} id window id
 * @returns {Promise<chrome.windows.Window>} a promise of the window
 */
export async function getWindowByID(id: number) {
    return new Promise<chrome.windows.Window>((resolve) => {
        chrome.windows.get(id, (found) => {
            // Touch lastError so a failed lookup is not reported as an
            // unchecked error; the value itself is intentionally ignored.
            chrome.runtime.lastError;
            resolve(found);
        });
    });
}
/**
 * Promise wrapper around chrome.windows.create for a new incognito window.
 * @returns a promise of the created window
 */
export async function CreateIncognitoWindow() {
    const createData = { incognito: true } as chrome.windows.CreateData;
    return new Promise((resolve) => {
        chrome.windows.create(createData, (created: chrome.windows.Window) => {
            resolve(created);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.query for the active tab.
 * @param {boolean} currentWindow passed through as the `currentWindow` query filter
 * @returns {Promise<chrome.tabs.Tab>} a promise of the first matching tab (undefined if none)
 */
export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
    const queryInfo = { active: true, currentWindow: currentWindow };
    return new Promise((resolve) => {
        chrome.tabs.query(queryInfo, (tabs) => resolve(tabs[0]));
    });
}
/**
 * Promise wrapper around chrome.tabs.get. Never rejects: it resolves with
 * whatever the callback receives.
 * @param {number} id tab id
 * @returns {Promise<chrome.tabs.Tab>} a promise of the tab
 */
export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
    return new Promise((resolve) => {
        chrome.tabs.get(id, (found) => {
            // Touch lastError so a failed lookup is not reported as an
            // unchecked error; the value itself is intentionally ignored.
            chrome.runtime.lastError;
            resolve(found);
        });
    });
}

31
src/background/caches.ts Normal file
View File

@ -0,0 +1,31 @@
import { logger } from "../common/logger";
import { Actions } from "../common";
import { messageSubscribers } from "./messaging";
/**
 * Short-lived holder of an uploaded extractor state.
 *
 * A state arrives through an UPLOAD_STATE message, can be read exactly once
 * through `state`, and is dropped 30 seconds after its upload if nobody read it.
 */
export class Caches {
    private _state: string = "";
    // pending expiry of the current state, if any
    private _expiry: ReturnType<typeof setTimeout> | undefined = undefined;

    constructor() {
        messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
            sendResponse('recieved!');
            this.setState(request.fileName, request.state)
        });
    }

    /**
     * The cached state. Reading it consumes it: the cache is empty afterwards.
     */
    get state(): string {
        const s = this._state;
        this._state = "";
        this._cancelExpiry();
        return s;
    }

    /**
     * Cache a state for 30 seconds.
     * The expiry of a previously cached state is cancelled first; otherwise the
     * older timer would wipe the newer state before its own 30 seconds are over.
     * @param name name of the state, only used for logging
     * @param content the state
     */
    setState(name: string, content: string) {
        this._cancelExpiry();
        this._state = content;
        logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
        // clear cache in 30 seconds
        this._expiry = setTimeout(() => {
            this._expiry = undefined;
            if (this._state) {
                logger.info(`Uploaded state is cleaned after 30 second.`);
                this._state = "";
            }
        }, 30000);
    }

    // Cancel the pending expiry, if there is one.
    private _cancelExpiry() {
        if (this._expiry !== undefined) {
            clearTimeout(this._expiry);
            this._expiry = undefined;
        }
    }
}
export const caches = new Caches();

View File

@ -1,10 +1,24 @@
var __EXTRACTOR_STATE__ = ""; import { Task } from "./task";
import { parseUrls, saveFile } from "./tools";
import { createTab, getActiveTab, ping, redirectTab } from "./actions";
import { logger } from "../common/logger";
import { caches } from "./caches";
import { ExtractResult } from "./result";
class Extractor { export class Extractor {
constructor(options) { private _tasks: Task[] = [];
this._tasks = []; private _running = false;
this._running = false; private _options: any = {};
this._options = options; constructor(options?) {
if (options) this._options = options;
}
static async ping(count: number = 1) {
let tab = await getActiveTab(true) || await getActiveTab(false);
let succ = await ping(tab, count);
if (!succ) {
logger.error('Cannot contact with active tab.');
return;
}
} }
/** /**
* Save current state, in case we restore it later. * Save current state, in case we restore it later.
@ -16,12 +30,12 @@ class Extractor {
* Restore previous state by loading from saved state. * Restore previous state by loading from saved state.
*/ */
load() { load() {
if (!__EXTRACTOR_STATE__) { let content = caches.state;
if (!content) {
logger.info('No state found. Please upload a saved state from the popup window first.'); logger.info('No state found. Please upload a saved state from the popup window first.');
return; return;
} }
let state = JSON.parse(__EXTRACTOR_STATE__); let state = JSON.parse(content);
__EXTRACTOR_STATE__ = "";
this._options = state._options; this._options = state._options;
this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t)); this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t));
return this; return this;
@ -32,10 +46,21 @@ class Extractor {
* If url arguments not given within later tasks, they will use previous task result as input (target url list). * If url arguments not given within later tasks, they will use previous task result as input (target url list).
* @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls. * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
*/ */
task(...args) { task(...args: any) {
this._tasks.push(new Task(this._options, ...args)); this._tasks.push(new Task(this._options, ...args));
return this; return this;
} }
/**
 * Get the results of a task.
 * The id is passed through _checkTaskId first; when that yields a negative
 * value nothing is returned.
 * @param {number} id which task's results to get
 * @returns {ExtractResult} the results of the task, or undefined for a rejected id
 */
results(id?: number): ExtractResult {
id = this._checkTaskId(id);
if (id < 0) return;
return this._tasks[id].results;
}
/** /**
* Clear tasks and task caches. * Clear tasks and task caches.
*/ */
@ -46,14 +71,30 @@ class Extractor {
/** /**
* Start the task chain. * Start the task chain.
*/ */
async start() { start() {
return this._startTasks(0); return this._startTasks(0);
} }
/**
 * Stop tasks.
 * Without an id every task is stopped. With an id only that task is stopped,
 * provided _checkTaskId accepts the id (does not yield a negative value).
 * @param {number} id which task to stop; omit to stop all tasks
 */
stop(id?: number) {
    if (id === undefined) {
        for (const task of this._tasks) {
            task.stop();
        }
        return;
    }
    const taskId = this._checkTaskId(id);
    if (taskId < 0) return;
    this._tasks[taskId].stop();
}
/**
 * Call watch() on one task, provided _checkTaskId accepts the id
 * (does not yield a negative value).
 * @param {number} id which task to watch
 */
watch(id: number) {
    const taskId = this._checkTaskId(id);
    if (taskId < 0) {
        return;
    }
    this._tasks[taskId].watch();
}
/** /**
* restart from specified task, but don't restart the previous tasks. * restart from specified task, but don't restart the previous tasks.
* @param {number} from where to restart the tasks, begins with 0 * @param {number} from where to restart the tasks, begins with 0
*/ */
async restart(from = 0) { restart(from: number = 0) {
let id = this._checkTaskId(from, 0); let id = this._checkTaskId(from, 0);
if (id < 0) return; if (id < 0) return;
for (let i = id; i < this._tasks.length; i++) { for (let i = id; i < this._tasks.length; i++) {
@ -61,7 +102,7 @@ class Extractor {
} }
return this._startTasks(0); return this._startTasks(0);
} }
async _startTasks(from) { async _startTasks(from: number) {
if (this._running) { if (this._running) {
logger.info('The Extractor is running. Please wait..'); logger.info('The Extractor is running. Please wait..');
return; return;
@ -85,17 +126,17 @@ class Extractor {
} }
} }
this._running = true; this._running = true;
return this._tasks.reduce((pms, task, i) => { return this._tasks.reduce((pms, task: Task, i: number) => {
return pms.then( return pms.then(
() => { () => {
if (i < from) return; if (i < from) return;
if (i > 0) { if (i > 0) {
let prevTask = this._tasks[i - 1]; let prevTask = this._tasks[i - 1];
return task.execute(tab, new ExtractResult(prevTask.results)); return task.execute(tab, prevTask.results);
} }
return task.execute(tab, undefined); return task.execute(tab);
}); });
}, Promise.resolve(undefined)).then( }, Promise.resolve<void>(undefined)).then(
() => { () => {
this._running = false; this._running = false;
this.export(); this.export();
@ -109,26 +150,26 @@ class Extractor {
* export result of a task to CSV * export result of a task to CSV
* @param {number} taskid which task id to save, begins with 0 * @param {number} taskid which task id to save, begins with 0
*/ */
export(taskid) { export(taskid?: number) {
let id = this._checkTaskId(taskid, this._tasks.length - 1); let id = this._checkTaskId(taskid, this._tasks.length - 1);
if (id < 0) return; if (id < 0) return;
let results = this._tasks[id].results let results = this._tasks[id].results
if (!results.length) { let count = results.data.length
if (!count) {
logger.info(`No result for task #${id}. Forget to call ".start()"?`); logger.info(`No result for task #${id}. Forget to call ".start()"?`);
return; return;
} }
results.unshift(this._tasks[id].fieldSelectors); results.header = this._tasks[id].fieldSelectors;
let exResults = new ExtractResult(results);
let msg = ` let msg = `
Please confirm to download (${results.length - 1} items) Please confirm to download (${count} items)
${exResults.toString(50) || "- Empty -"} ${results.toString(50) || "- Empty -"}
`.trim(); `.trim();
if (confirm(msg)) { if (confirm(msg)) {
saveFile(exResults, "text/csv"); saveFile(results.toString(), "text/csv");
} }
} }
_checkTaskId(id, defaultId) { private _checkTaskId(id: number, defaultId?: number) {
if (!this._tasks.length) { if (!this._tasks.length) {
logger.info("No task found."); logger.info("No task found.");
return -1; return -1;

14
src/background/index.ts Normal file
View File

@ -0,0 +1,14 @@
import { Extractor } from "./extractor";
declare global {
    interface Window {
        $: (...args: any) => void;
        Extractor: any;
    }
}
// Expose the Extractor class so it can be used from the background page console.
window.Extractor = Extractor;
// Shortcut: build a single-task Extractor from the arguments and start it right away.
window.$ = function (...args) {
    const extractor = new Extractor().task(...args);
    return extractor.start();
}

150
src/background/messaging.ts Normal file
View File

@ -0,0 +1,150 @@
import { Request, Actions, Response } from "../common";
import { getTabByID } from "./actions";
import { logger } from "../common/logger";
/**
 * Synchronous response checker: receives the response, the `chrome.runtime.lastError`
 * captured for it and the number of tries so far, and returns the final value.
 * Returning undefined makes `sendMessage` retry.
 */
export type ResponseCheckerSync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => T;
/** Same contract as ResponseCheckerSync, but the final value is delivered through a Promise. */
export type ResponseCheckerAsync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => Promise<T>;
/** Either flavour of response checker accepted by `sendMessage`. */
export type ResponseChecker<T> = ResponseCheckerSync<T> | ResponseCheckerAsync<T>;
/**
 * Send a message to the target tab repeatedly until the checked result is not undefined.
 * @param {object} tab the tab where to send the message.
 * @param {object} req the request data.
 * @param {string} log message logged to console after every try.
 * @param {function} dataChecker (response, err, tryCount) => any.
 * Checks the response and decides what value is finally returned.
 * Return undefined to make 'sendMessage' retry.
 * Throw (or reject) to cancel this promise with that error.
 * @param {number} timeout seconds to wait for a single response, default: 10.
 * @param {number} interval retry interval, default: 500ms.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the response.
 */
export function sendMessage<T>(
    tab: chrome.tabs.Tab,
    req: Request,
    log?: string,
    dataChecker?: ResponseChecker<T>,
    timeout?: number,
    interval?: number,
    limit?: number
) {
    timeout = timeout || 10;
    interval = interval || 500;
    limit = isNaN(limit) ? 0 : limit;
    let count = 0;
    return new Promise<T>((resolve, reject) => {
        // a failure inside loop() (e.g. the tab lookup) must reject instead of getting lost
        loop().catch(reject);
        async function loop() {
            logger.debug("Request for", Actions[req.action]);
            let tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                reject("Task interrupted due to the target tab is closed.");
                return;
            }
            if (limit && count >= limit) {
                reject(`sendMessage loop limit ${limit} reached.`);
                return;
            }
            count++;
            let timedOut = false;
            // the timer handle has its own name, so `timeout` keeps meaning "seconds"
            let timer = setTimeout(() => {
                timedOut = true;
                reject(`${Actions[req.action]} request timeout after ${timeout}s`);
            }, timeout * 1000);
            chrome.tabs.sendMessage(tab.id, req, async (r: Response<T>) => {
                clearTimeout(timer);
                // check error but do nothing until dataChecker.
                let err = chrome.runtime.lastError;
                // the promise is already rejected, a late response must not restart the retry loop
                if (timedOut) return;
                let [result, error] = await checkResponse(dataChecker, r, err, count);
                if (error) {
                    reject(error);
                    return;
                }
                let flag = result !== undefined;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(result);
                } else {
                    setTimeout(() => {
                        logger.debug('Invalid response', r, 'retry...');
                        loop().catch(reject);
                    }, interval);
                }
            });
        }
    });
}
/**
 * Run the optional data checker against a response and normalize the outcome
 * to a `[result, error]` pair.
 * @param dataChecker optional checker; without it the response's own result/error are used.
 * @param response the response received from the content script.
 * @param error the `chrome.runtime.lastError` captured for this response.
 * @param tryCount how many times the request has been sent so far.
 * @return `[result, error]`; both undefined when there is no response at all.
 */
async function checkResponse<T>(
    dataChecker: ResponseChecker<T>,
    response: Response<T>,
    error: chrome.runtime.LastError,
    tryCount: number
): Promise<[T, string]> {
    // response could be undefined if the content script is interrupted.
    // don't check, tell sendMessage to retry.
    if (!response) return [undefined, undefined];
    if (!dataChecker) return [response.result, response.error];

    let checked: T | Promise<T>;
    try {
        checked = dataChecker(response, error, tryCount);
    } catch (thrown) {
        return [undefined, thrown];
    }
    // a plain value is final, only Promises need to be awaited
    if (!(checked instanceof Promise)) {
        return [checked, undefined];
    }
    let failure: any;
    const settled = await checked.then(
        value => value,
        reason => {
            failure = reason;
            return reason;
        }
    );
    if (failure) {
        return [undefined, failure];
    }
    return [settled, undefined];
}
/** A subscriber that handles a request synchronously; it gets the request, its sender and the `sendResponse` callback. */
export type ActionSubscriberSync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void;
/** A subscriber with the same arguments that does its work asynchronously and returns a Promise. */
export type ActionSubscriberAsync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => Promise<void>;
/** Either flavour of subscriber that can be registered for an action. */
export type ActionSubscriber = ActionSubscriberSync | ActionSubscriberAsync;
/**
 * Registry of subscribers per action.
 */
class MessageSubscribers {
    private listeners: { [key: number]: ActionSubscriber[] } = {};
    /** Register a subscriber for an action (the same subscriber may be added more than once). */
    addListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) {
            this.listeners[action] = [];
        }
        this.listeners[action].push(subscriber);
    }
    /** Remove every registration of the subscriber for an action, editing the list in place. */
    removeListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) {
            this.listeners[action] = [];
        }
        const registered = this.listeners[action];
        // walk backwards so that splicing does not shift the entries still to visit
        for (let idx = registered.length - 1; idx >= 0; idx--) {
            if (registered[idx] == subscriber) {
                registered.splice(idx, 1);
            }
        }
        logger.debug(`${registered.length} subscriber(s) remained for action ${Actions[action]}`);
    }
    /** Get the subscriber list of an action; undefined if nothing was ever registered for it. */
    getListeners(action: Actions): ActionSubscriber[] {
        return this.listeners[action]
    }
}
/** The shared subscriber registry of the background script. */
export const messageSubscribers = new MessageSubscribers();
// Dispatch every incoming runtime message to the subscribers of its action.
chrome.runtime.onMessage.addListener(function (request: Request, sender, sendResponse) {
    let subscribers = messageSubscribers.getListeners(request.action);
    if (!subscribers || !subscribers.length) {
        sendResponse("Request not supported.");
        return;
    }
    let promises: Promise<any>[] = [];
    // iterate over a snapshot, removeListener() edits the list in place
    for (let subscriber of subscribers.slice()) {
        let p = subscriber(request, sender, sendResponse);
        if (p instanceof Promise) promises.push(p);
    }
    if (!promises.length) return;
    // don't lose failures of asynchronous subscribers
    Promise.all(promises).catch(
        e => logger.error(`Subscriber of ${Actions[request.action]} failed:`, e)
    );
    // Chrome keeps the response channel open only when the listener returns literal true,
    // so asynchronous subscribers can still call sendResponse later.
    return true;
});

85
src/background/result.ts Normal file
View File

@ -0,0 +1,85 @@
import { logger } from "../common/logger";
import { getActiveTab, ping, redirectTab } from "./actions";
import { parseUrls } from "./tools";
/**
 * Table of extracted values (rows of string cells) with an optional header,
 * plus helpers to read, export and iterate it.
 */
export class ExtractResult {
    private _header: string[];
    private _data: string[][] = [];
    /**
     * @param data rows of cells; a missing value is treated as an empty table.
     */
    constructor(data: string[][]) {
        this._data = data || [];
    }
    /** Get the row at the given index. */
    row(index: number): string[] {
        return this._data[index];
    }
    /** Get the cells at the given column index, one per row. */
    column(index: number): string[] {
        return this._data.map(cells => cells[index]);
    }
    /** Flatten all rows into a single list of cells, row by row. */
    squash(): string[] {
        return this._data.reduce((p, c) => p.concat(c), []);
    }
    /** Set the header line that `toString` puts before the data rows. */
    set header(h: string[]) {
        this._header = h
    }
    /** The data rows (without header). */
    get data(): string[][] {
        return this._data;
    }
    /**
     * Render the table as CSV, every cell quoted and inner quotes doubled.
     * The internal data is never modified, so repeated calls give the same text.
     * @param rowsCount max number of data rows to render, 0 (default) or less for all rows.
     * @return the CSV text, header line first if a non-empty header is set.
     */
    toString(rowsCount: number = 0): string {
        const body = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        const hasHeader = this._header && this._header.length;
        // build a new list instead of unshifting the header into the stored rows
        const rows = hasHeader ? [this._header, ...body] : body;
        return rows.reduce(
            (csv, lineCells) => {
                if (!lineCells || !lineCells.length) {
                    return csv + "\n";
                }
                const line = lineCells
                    .map(cell => '"' + (cell || "").trim().replace(/"/g, '""') + '"')
                    .join(",");
                return csv + line + "\n";
            },
            ""
        );
    }
    /**
     * Call `fn` for every cell, row by row, one after another:
     * each call is chained after the promise of the previous one.
     * An error stops the walk and is logged, it is not thrown to the caller.
     * @param fn callback receiving the row index, the column index and the cell value.
     */
    async walk(fn: (row: number, col: number, value: string) => void) {
        let pms = Promise.resolve(null);
        for (let i = 0; i < this._data.length; i++) {
            let cells = this._data[i];
            for (let j = 0; j < cells.length; j++) {
                let row = i;
                let col = j;
                let value = cells[j];
                pms = pms.then(
                    () => fn(row, col, value)
                )
            }
        }
        return pms.catch(err => {
            logger.error(err);
        });
    }
    /**
     * Redirect the active tab to every url found in this result, one after another.
     * Errors are logged, they are not thrown to the caller.
     */
    async visit() {
        let urls = parseUrls(this);
        let tab = await getActiveTab(true) || await getActiveTab(false);
        let succ = await ping(tab);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
        return urls.reduce(
            (pms, url: string, i: number) => {
                return pms.then(
                    async () => {
                        return redirectTab(tab, url, false);
                    });
            }, Promise.resolve<void>(undefined)
        ).catch(err => {
            logger.error(err);
        });
    }
}

View File

@ -1,4 +1,6 @@
const signitures = ` import { ExtractResult } from "./result";
export const signitures = `
## Usage ## Usage
// single task // single task
$(...args); $(...args);
@ -18,12 +20,13 @@ function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
$(".item", ["a", "a@href"]); $(".item", ["a", "a@href"]);
## See Detailed Help: ## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion https://git.qjebbs.com/jebbs/data-extracter-extesion
`.trim(); `.trim();
function testArgs(...args) { export function testArgs(...args: any) {
switch (args.length) { switch (args.length) {
case 0, 1: case 0:
case 1:
return false; return false;
case 2: case 2:
return args[0] && args[1] && return args[0] && args[1] &&
@ -66,7 +69,3 @@ function testArgs(...args) {
return arr.reduce((p, c) => p && tester(c), true); return arr.reduce((p, c) => p && tester(c), true);
} }
} }
function argsToString(...args) {
return args.map(v => (v instanceof Array ? `[${v.join(', ')}]` : v.toString())).join(', ');
}

178
src/background/task.ts Normal file
View File

@ -0,0 +1,178 @@
import { parseUrls } from "./tools";
import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common";
import { logger } from "../common/logger";
/**
 * A single extraction task: the selectors to extract, the urls to visit
 * and the results collected per url.
 */
export class Task {
    // extracted rows, keyed by the url they were extracted from
    private _data: { [key: string]: string[][] } = {};
    // the urls of _data, in the order their results were first saved
    private _data_keys: string[] = [];
    private _options: any;
    private _itemsSelector: string;
    private _fieldSelectors: string[];
    private _urls: string[] = [];
    private _running = false;
    // REPORT_NEW_PAGE subscribers registered by watch(), removed by stop()
    private _listeners: ActionSubscriber[] = [];
    constructor(options: any, ...arg: any);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, from: number, to: number, interval: number);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, pages: number[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], urls: string[]);
    /**
     * @param options options shared with the owner of the task ("scrollToBottom" is read here).
     * @param args itemsSelector, fieldSelectors, and optional arguments specifying the target urls.
     * @throws {Error} if the arguments don't pass `testArgs`.
     */
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        this._options = options;
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }
    /**
     * Restore selectors, urls and collected data from a saved state object.
     * @param state an object carrying the `_`-prefixed fields of a task.
     */
    load(state: any): Task {
        this._data = state._data;
        this._data_keys = state._data_keys;
        this._itemsSelector = state._itemsSelector;
        this._fieldSelectors = state._fieldSelectors;
        this._urls = state._urls;
        return this;
    }
    /** The target urls given at construction (or loaded from state). */
    get urls(): string[] {
        return this._urls;
    }
    /** All collected rows, concatenated in the order their urls were first saved, as a new ExtractResult. */
    get results(): ExtractResult {
        let rs: string[][] = this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
        return new ExtractResult(rs);
    }
    /** The field selectors of this task. */
    get fieldSelectors(): string[] {
        return this._fieldSelectors;
    }
    /** Stop the task and drop everything collected so far. */
    clean(): Task {
        this.stop();
        this._data = {};
        this._data_keys = [];
        return this;
    }
    /** Mark the task as not running and unsubscribe all watchers registered by watch(). */
    stop() {
        this._running = false;
        let listener: ActionSubscriber;
        while (listener = this._listeners.pop()) {
            messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
        }
    }
    /**
     * Watch the incognito window (or else the current window): every page reporting
     * REPORT_NEW_PAGE from that window is extracted and saved under its url.
     * Watching goes on until stop() is called or the watched window is gone.
     */
    async watch() {
        if (this._running) {
            logger.info("The task is running. Please wait...");
            return;
        }
        this._running = true;
        let targetWindow: chrome.windows.Window;
        try {
            targetWindow = await findIncognitoWindow() || await getCurrentWindow();
        } catch (e) {
            // nothing is watched, don't leave the task marked as running
            this._running = false;
            throw e;
        }
        if (!targetWindow) {
            this._running = false;
            logger.info("No window to watch...");
            return;
        }
        let watchTaskID = 0;
        let listener: ActionSubscriber = async (request, sender, sendResponse) => {
            let findWindow = await getWindowByID(targetWindow.id);
            if (!findWindow) {
                // stop watch on window close: unsubscribes and resets the running state.
                this.stop();
                return;
            }
            // only watch current window.
            if (!sender.tab || sender.tab.windowId != targetWindow.id) return;
            let taskID = watchTaskID++;
            logger.info(`Watcher #${taskID} starts.`);
            try {
                await this.makeOptionalTasks(sender.tab);
                let results = await extractTabData(sender.tab, this._itemsSelector, this._fieldSelectors, sender.tab.url, true);
                if (results && results.length) {
                    this.saveResult(results, sender.tab.url);
                }
                logger.info(`Watcher #${taskID} ends.`);
            } catch (e) {
                logger.error(`Watcher #${taskID} ends with:`, e);
            }
        }
        this._listeners.push(listener);
        messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
    }
    /**
     * Visit the target urls one by one in the given tab and save the data extracted from each.
     * Without own urls, the urls are parsed from `upstreamData`, or else the tab's current url is used.
     * Urls that already have saved data are skipped.
     * @param tab the tab to work in.
     * @param upstreamData result of the previous task, used as url source.
     * @throws a string message if there is no tab, the task is already running or it is stopped
     * by the user; whatever the underlying actions reject with otherwise.
     */
    async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
        if (!tab) throw "No tab to execute the task.";
        if (this._running) throw "The task is running. Please wait...";
        this._running = true;
        try {
            let urls = this._urls
            if (!urls.length) {
                // a queryUrl failure aborts the task instead of continuing with an undefined url
                urls = upstreamData ? parseUrls(upstreamData) : [await queryUrl(tab)];
            }
            let results: string[][] = null;
            for (let i = 0; i < urls.length; i++) {
                // the result of a page is saved when moving on to the next one
                if (i > 0 && results instanceof Array) {
                    this.saveResult(results, urls[i - 1]);
                }
                let url = urls[i];
                if (this._data[url]) {
                    // already extracted, nothing new to save for this url
                    results = undefined;
                    continue;
                }
                await this.runningCheck(() => redirectTab(tab, url));
                await this.makeOptionalTasks(tab);
                results = await this.runningCheck(() => extractTabData(tab, this._itemsSelector, this._fieldSelectors));
            }
            if (results && results.length) {
                this.saveResult(results, urls[urls.length - 1]);
            }
        } finally {
            this._running = false;
        }
    }
    /**
     * Run the optional per-page steps enabled in the options (currently "scrollToBottom").
     * Always returns a promise, also when there is nothing to do.
     */
    private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
        if (this._options && this._options["scrollToBottom"]) {
            return this.runningCheck(() => scrollToBottom(tab));
        }
        return Promise.resolve();
    }
    /**
     * Run `fn` only if the task is still running.
     * @throws a string message if the task was stopped.
     */
    private runningCheck(fn: () => Promise<any>): Promise<any> {
        if (!this._running) throw "The task is stopped by user.";
        return fn();
    }
    /** Save (or replace) the rows extracted for a url. */
    private saveResult(results, key) {
        if (this._data[key] === undefined) {
            // do not add keys again
            this._data_keys.push(key);
        }
        this._data[key] = results;
        logger.info(`${results.length} items found.`)
    }
}

62
src/background/tools.ts Normal file
View File

@ -0,0 +1,62 @@
import { ExtractResult } from "./result";
const URL_REG = /^\s*(https?):\/\//im;
/**
 * Build the target url list from one of the supported argument shapes:
 * - (urls: string[]) the list itself;
 * - (result: ExtractResult) every cell of the result that looks like a http(s) url;
 * - (template: string, pages: any[]) the template with "${page}" replaced by each page;
 * - (template: string, from: number, to: number, interval: number) the template with
 *   "${page}" replaced by from, from + interval, ... up to and including to.
 * Anything else gives an empty list.
 * @throws {Error} if a range is given whose interval is not a positive number
 * (it could never reach its end).
 */
export function parseUrls(...args): string[] {
    if (!args.length) return [];
    let arg = args.shift();
    if (arg instanceof Array) {
        return arg;
    }
    if (arg instanceof ExtractResult) {
        return arg.squash().filter(v => URL_REG.test(v));
    }
    let urlTempl = arg;
    if (!urlTempl) return [];
    if (args[0] instanceof Array) {
        return args[0].map(p => urlTempl.replace("${page}", p));
    }
    if (args.length >= 3) {
        let [from, to, interval] = args;
        if (from <= to && !(interval > 0)) {
            throw new Error(`Invalid page interval: ${interval}`);
        }
        let urls = [];
        for (let i = from; i <= to; i += interval) {
            urls.push(urlTempl.replace("${page}", i));
        }
        return urls;
    }
    return [];
}
/**
 * Offer the given text to the user as a file download.
 * @param data the file content.
 * @param mimeType the mime type of the content.
 * @param fileName the file name; defaults to the document title, or "result".
 */
export function saveFile(data: string, mimeType: string, fileName?: string) {
    fileName = fileName || document.title || "result";
    let blob: Blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        })
    } else {
        // legacy fallback where the Blob constructor is not available
        var BlobBuiler = window.MSBlobBuilder;
        var builer = new BlobBuiler();
        builer.append(data);
        blob = builer.getBlob(mimeType)
    }
    var link = document.createElement("a");
    if ('download' in link) {
        var URL = window.URL || window.webkitURL;
        var url = URL.createObjectURL(blob);
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
        // release the blob once the download had time to start,
        // otherwise every export stays in memory for the lifetime of the page
        setTimeout(() => URL.revokeObjectURL(url), 1000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName)
    } else {
        // the page navigates to the object url, so it can't be revoked here
        location.href = (window.URL || window.webkitURL).createObjectURL(blob)
    }
}

28
src/common/index.ts Normal file
View File

@ -0,0 +1,28 @@
/**
 * Kinds of messages exchanged between the parts of the extension,
 * carried in `Request.action`.
 */
export enum Actions {
// from background to content script
// extract data from the page with the selectors of the request
EXTRACT = 1,
// navigate the page to the url of the request
GOTO_URL,
// liveness check, answered with "pong"
PING,
// ask for the current url of the page
QUERY_URL,
// scroll the page to its bottom
SCROLL_BOTTOM,
// make the content script ignore requests until WAKEUP
SLEEP,
// make a sleeping content script available again
WAKEUP,
// from popup to background script
UPLOAD_STATE,
// from content to background script
// sent by the content script when it is loaded into a page
REPORT_NEW_PAGE,
}
/**
 * A message sent between the scripts of the extension.
 * Only `action` is mandatory, the other fields depend on the action.
 */
export interface Request {
// what the receiver is asked to do
action: Actions
// EXTRACT: selector of the item elements
itemsSelector?: string
// EXTRACT: selectors of the fields inside each item
fieldSelectors?: string[]
// GOTO_URL: where to go; EXTRACT: the url the page is expected to have
url?: string
// UPLOAD_STATE: name of the uploaded state file
fileName?: string
// UPLOAD_STATE: text content of the uploaded state file
state?: string
}
/**
 * The answer of the content script to a Request.
 */
export interface Response<T> {
// the value produced by the action, if any
result: T;
// the error message if the action failed
error: string;
}

View File

@ -1,42 +1,36 @@
const LOGGER_LEVEL = { export enum LOGGER_LEVEL {
DEBUG: 1, DEBUG = 1,
INFO: 2, INFO,
WARNING: 3, WARN,
ERROR: 4, ERROR,
DISABLED: 100, DISABLED,
properties: {
1: { name: "debug", value: 1, prefix: "DEBUG" },
2: { name: "info", value: 2, prefix: "INFO" },
3: { name: "warning", value: 3, prefix: "WARN" },
4: { name: "error", value: 3, prefix: "ERROR" }
}
}; };
class Logger { export class Logger {
_notificationId = undefined; private _notificationId = undefined;
_log_level = LOGGER_LEVEL.INFO; private _log_level = LOGGER_LEVEL.INFO;
_notify_level = LOGGER_LEVEL.ERROR; private _notify_level = LOGGER_LEVEL.ERROR;
constructor(logLevel, notifyLevel) { constructor(logLevel, notifyLevel) {
if (logLevel) this._log_level = logLevel; if (logLevel) this._log_level = logLevel;
if (notifyLevel) this._notify_level = notifyLevel; if (notifyLevel) this._notify_level = notifyLevel;
chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined }); if (chrome.notifications) chrome.notifications.onClosed.addListener((id, byUser) => { this._notify_level = undefined });
} }
get logLevel() { get logLevel() {
return this._log_level; return this._log_level;
} }
set logLevel(val) { set logLevel(val: LOGGER_LEVEL) {
this._log_level = val; this._log_level = val;
} }
get notifyLevel() { get notifyLevel() {
return this._notify_level; return this._notify_level;
} }
set notifyLevel(val) { set notifyLevel(val: LOGGER_LEVEL) {
this._notify_level = val; this._notify_level = val;
} }
log(level, loggerFn, ...msgs) { log(level: LOGGER_LEVEL, loggerFn: Function, ...msgs) {
if (level < this._log_level) return; if (level < this._log_level) return;
let time = new Date().toLocaleString(); let time = new Date().toLocaleString();
loggerFn(`${time} [${LOGGER_LEVEL.properties[level].prefix}]`, ...msgs); loggerFn(`${time} [${LOGGER_LEVEL[level]}]`, ...msgs);
if (level < this._notify_level) return; if (level < this._notify_level) return;
this.notify(...msgs); this.notify(...msgs);
} }
@ -47,7 +41,7 @@ class Logger {
this.log(LOGGER_LEVEL.INFO, console.info, ...msgs); this.log(LOGGER_LEVEL.INFO, console.info, ...msgs);
} }
warn(...msgs) { warn(...msgs) {
this.log(LOGGER_LEVEL.WARNING, console.info, ...msgs); this.log(LOGGER_LEVEL.WARN, console.info, ...msgs);
} }
error(...msgs) { error(...msgs) {
this.log(LOGGER_LEVEL.ERROR, console.info, ...msgs); this.log(LOGGER_LEVEL.ERROR, console.info, ...msgs);
@ -78,4 +72,4 @@ class Logger {
} }
} }
const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED); export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);

101
src/content/actions.ts Normal file
View File

@ -0,0 +1,101 @@
import { logger } from "../common/logger";
/**
 * Extract a table of values from the current page: one row per element matching
 * `itemsSelector`, one cell per field selector.
 *
 * A field selector has the form `[!]css[@attr]`:
 * - a leading `!` dispatches a click on every matched element;
 * - `css` is queried inside the item, an empty `css` means the item itself;
 * - `@attr` reads that property of the element instead of its trimmed text.
 * Multiple matches of one field are joined with line breaks.
 *
 * Since some elements may be loaded asynchronously, an empty list is returned
 * when no item is found, or when a field is not found in any item; this is meant
 * as "not ready yet" (presumably the sender retries on it — the retry logic is not in this file).
 * If user writes wrong selectors, that state never ends.
 *
 * @param itemsSelector selector of the item elements.
 * @param fieldSelectors selectors of the fields inside each item.
 * @param expectedURL if given, the url the page must currently have.
 * @return the extracted rows, or an empty list if the page is not ready.
 * @throws a string message if the page url differs from `expectedURL`.
 */
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
    if (expectedURL && location.href != expectedURL) {
        throw 'Target tab URL changed, aborting...';
    }
    // which field selectors (as given by the caller) matched in at least one item
    let fieldFound: { [key: string]: boolean } = {};
    let items: Element[] = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!items.length) return [];
    let results: string[][] = items.map(
        item => {
            return fieldSelectors.map(
                fieldSelector => {
                    let doClick = fieldSelector.startsWith("!");
                    let selector = doClick ? fieldSelector.substring(1) : fieldSelector;
                    let [cls, attr] = selector.split('@').slice(0, 2);
                    cls = cls.trim()
                    let fieldElements: Element[] = cls != ""
                        ? Array.from(item.querySelectorAll(cls))
                        : [item];
                    if (!fieldElements.length) {
                        return;
                    }
                    // keyed by the original selector (with its "!"), as that is
                    // what the not-found check below looks up
                    fieldFound[fieldSelector] = true;
                    return fieldElements.map(find => {
                        if (doClick) {
                            let e = document.createEvent("MouseEvents");
                            e.initEvent("click", true, true);
                            find.dispatchEvent(e);
                        }
                        return attr ? find[attr] : find.textContent.trim();
                    }).join('\n')
                }
            )
        }
    );
    // TODO: configurable wait logic
    // if it exists a field, which is not found in any row, the sender should retry.
    let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
    if (notFoundFields.length > 0) {
        logger.debug('should wait for:', notFoundFields.join(','));
        return [];
    }
    return results;
}
/**
 * Scroll the page down until less than 20px remain below the viewport,
 * trying once a second, at most 10 times.
 * @return the promise of `executeUntil`.
 */
export function scrollToBottom() {
    const scrollDown = () => window.scrollTo(0, document.body.clientHeight);
    const reachedBottom = () => {
        const remaining = document.body.clientHeight - window.scrollY - window.innerHeight;
        return remaining < 20;
    };
    return executeUntil(scrollDown, reachedBottom, "Scroll to page bottom...", 1000, 10);
}
/**
 * Repeatedly execute a function until the detector returns true.
 * After every execution the detector is asked once, `interval` ms later.
 * @param {object} fn the function to execute
 * @param {object} detector the detector; without one, a single execution counts as success.
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of a function, 0 for no limit
 * @return {Promise} resolves to true once the detector is satisfied; rejects with an Error
 * when the limit is reached without success, or with whatever fn / detector throws.
 */
function executeUntil(fn: () => void, detector: () => boolean, log: string, interval: number, limit: number) {
    interval = interval || 500;
    let count = 0;
    return new Promise<boolean>((resolve, reject) => {
        loop();
        function loop() {
            try {
                fn();
            } catch (e) {
                reject(e);
                return;
            }
            // count the executions, the limit itself must stay as given
            count++;
            setTimeout(() => {
                let flag: boolean;
                try {
                    flag = !detector || detector();
                } catch (e) {
                    reject(e);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else if (limit && count >= limit) {
                    reject(new Error(`Gave up after ${limit} tries.`));
                } else {
                    loop();
                }
            }, interval);
        }
    });
}

75
src/content/index.ts Normal file
View File

@ -0,0 +1,75 @@
import { Request, Actions, Response } from '../common';
import { scrollToBottom, extract } from './actions';
// While asleep, every request except WAKEUP is answered with undefined.
let asleep = false;

/**
 * Entry point for messages sent to this content script.
 * Returns true for handled requests, to indicate the response is sent asynchronously.
 */
function handleRequest(request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) {
    if (!request.action) return;
    const mayAnswer = !asleep || Actions.WAKEUP == request.action;
    if (!mayAnswer) {
        if (sendResponse) sendResponse(undefined);
        return;
    }
    doAction(request, sender).then(response => {
        if (sendResponse) return sendResponse(response);
        return sendResponse;
    });
    return true;
}
chrome.runtime.onMessage.addListener(handleRequest);

// Tell the background script that this page has (re)loaded the content script.
chrome.runtime.sendMessage({
    action: Actions.REPORT_NEW_PAGE,
} as Request);
/**
 * Perform the action of a request in this page and wrap the outcome in a Response.
 * Anything thrown (or rejected) by an action ends up in the `error` field of the response,
 * this function itself does not throw.
 * @param request the request to handle.
 * @param sender the sender of the request (not used by any action yet).
 * @return the response carrying either the result or the error message.
 */
async function doAction(request: Request, sender: chrome.runtime.MessageSender): Promise<Response<any>> {
    let result: any;
    let error: string;
    try {
        switch (request.action) {
            case Actions.EXTRACT:
                result = extract(request.itemsSelector, request.fieldSelectors, request.url);
                break;
            case Actions.GOTO_URL:
                window.location.replace(request.url);
                // should not recieve any request until the page & script reload
                asleep = true;
                result = request.url;
                break;
            case Actions.PING:
                result = "pong";
                break;
            case Actions.QUERY_URL:
                result = window.location.href;
                break;
            case Actions.SCROLL_BOTTOM:
                // wait for the scrolling to finish: the response must carry its outcome,
                // not the pending promise, and a failure must reach the catch below
                result = await scrollToBottom();
                break;
            case Actions.SLEEP:
                asleep = true;
                result = "Content script is sleeping.";
                break;
            case Actions.WAKEUP:
                asleep = false;
                result = "Content script is available.";
                break;
            default:
                error = 'Unsupported action.'
                break;
        }
    } catch (err) {
        if (err instanceof Error) {
            error = err.message;
        } else {
            error = err;
        }
    }
    return newResponse(result, error);
}
/**
 * Build a Response from a result and an optional error message.
 * @param result the value to respond with.
 * @param err the error message, if any.
 */
function newResponse<T>(result: T, err?: string): Response<T> {
    return { result, error: err };
}

View File

@ -1,3 +1,5 @@
import { Request, Actions } from '../common';
window.onload = function () { window.onload = function () {
document.querySelector('#link-extension-detail') document.querySelector('#link-extension-detail')
.addEventListener('click', () => { .addEventListener('click', () => {
@ -8,7 +10,7 @@ window.onload = function () {
document.querySelector('#link-document') document.querySelector('#link-document')
.addEventListener('click', () => { .addEventListener('click', () => {
chrome.tabs.create({ chrome.tabs.create({
'url': `https://git.jebbs.co/jebbs/data-extracter-extesion` 'url': `https://git.qjebbs.com/jebbs/data-extracter-extesion`
}); });
}) })
document.querySelector('#state-input') document.querySelector('#state-input')
@ -19,10 +21,10 @@ window.onload = function () {
reader.readAsText(this.files[0], "UTF-8"); reader.readAsText(this.files[0], "UTF-8");
reader.onload = function (evt) { reader.onload = function (evt) {
var fileString = evt.target.result; var fileString = evt.target.result;
chrome.runtime.sendMessage({ chrome.runtime.sendMessage(<Request>{
action: ACTION_UPLOAD_STATE, action: Actions.UPLOAD_STATE,
state: fileString, state: fileString,
name: fileName fileName: fileName
}, r => { }, r => {
if (r) console.log('State sent:', r); if (r) console.log('State sent:', r);
}); });

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

@ -3,10 +3,9 @@
<link> <link>
<meta charset="utf-8"> <meta charset="utf-8">
<title>Data Extractor</title> <title>Data Extractor</title>
<script charset="UTF-8" type="text/javascript" src="../scripts/shared/common.js"></script> <script charset="UTF-8" type="text/javascript" src="../scripts/popup.bundle.js"></script>
<script charset="UTF-8" type="text/javascript" src="tip.js"></script>
<link rel="stylesheet" href="styles/bootstrap.min.css"> <link rel="stylesheet" href="../assets/bootstrap.min.css">
</head> </head>
<body style="margin: 20px 10px;"> <body style="margin: 20px 10px;">
@ -19,13 +18,12 @@
<div class="row"> <div class="row">
<div class="col"> <div class="col">
<div class="alert alert-info small"> <div class="alert alert-info small">
<!-- <h6>Usage:</h6> -->
<p> <p>
Goto <a href="#" id="link-extension-detail">Extension Detail</a>, click "backgroud page", Goto <a href="#" id="link-extension-detail">Extension Detail</a>, click "backgroud page",
and type your scripts in the console. and type your scripts in the console.
</p> </p>
<p> <p>
<img src="../images/console.png" alt="" <img src="../assets/console.png" alt=""
style="max-width: 489px; width: 100%; border-radius: 5px"> style="max-width: 489px; width: 100%; border-radius: 5px">
</p> </p>
@ -54,7 +52,7 @@
<p> <p>
<b>Full document at:</b> <b>Full document at:</b>
<br> <br>
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a> <a href="#" id="link-document">https://git.qjebbs.com/jebbs/data-extracter-extesion</a>
</p> </p>
</div> </div>
</div> </div>
@ -66,7 +64,7 @@
</div> </div>
<div class="row"> <div class="row">
<div class="col"> <div class="col">
<input type="file" name="state" id="state-input"> <input type="file" name="state" id="state-input">
</div> </div>
</div> </div>
</div> </div>

View File

Before

Width:  |  Height:  |  Size: 4.1 KiB

After

Width:  |  Height:  |  Size: 4.1 KiB

35
template/manifest.json Executable file
View File

@ -0,0 +1,35 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.5.1",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
"16": "icon.png",
"48": "icon.png",
"128": "icon.png"
},
"browser_action": {
"default_icon": "icon.png",
"default_popup": "html/popup.html",
"default_title": "Data Extracter"
},
"background": {
"scripts": [
"scripts/background.bundle.js"
],
"persistent": false
},
"content_scripts": [{
"matches": ["*://*/*"],
"js": [
"scripts/content.bundle.js"
],
"run_at": "document_idle"
}],
"incognito": "spanning",
"permissions": [
"activeTab",
"notifications"
]
}

12
tsconfig.json Normal file
View File

@ -0,0 +1,12 @@
{
"compilerOptions": {
"module": "commonjs",
"target": "es6",
"noImplicitAny": false,
"sourceMap": true,
"rootDir": "src",
"outDir": "dist/js",
"noEmitOnError": true,
"typeRoots": [ "node_modules/@types" ]
}
}

33
webpack.config.js Normal file
View File

@ -0,0 +1,33 @@
// Webpack build of the extension: compiles the TypeScript entry points into
// one bundle each and copies the static template files next to them.
const path = require('path');
const CopyPlugin = require('copy-webpack-plugin');
module.exports = {
mode: 'production',
// one bundle per script of the extension; the entry name becomes [name] in output.filename
entry: {
background: './src/background/index.ts',
content: './src/content/index.ts',
popup: './src/popup/index.ts',
},
// devtool: 'inline-source-map',
// bundles are written to dist/scripts/<name>.bundle.js
output: {
path: path.resolve(__dirname, 'dist'),
filename: 'scripts/[name].bundle.js'
},
module: {
rules: [
// .ts / .tsx files go through ts-loader, dependencies are left alone
{
test: /\.tsx?$/,
use: 'ts-loader',
exclude: /node_modules/
}
]
},
resolve: {
extensions: ['.tsx', '.ts', '.js']
},
plugins: [
// copy everything under template/ into the output directory
new CopyPlugin([
{ from: '**/*', to: '.', toType: "dir" },
], { context: 'template', logLevel: 'warn' }),
]
};