Compare commits

...

30 Commits

Author SHA1 Message Date
e87e7010ec improvements
* chance to continue on mismatch url for redirectTab
* support empty field selectors
* add Extractor.results()
* add ExtractResult.walk(), ExtractResult.visit()
* add ! directive to click elements
* code optimize
2021-04-20 14:20:05 +08:00
108ebb835f fix task running state 2021-04-20 12:01:10 +08:00
e0b0a5e986 add timeout for messaging 2021-04-20 12:00:59 +08:00
9cd25e3c1d update url 2021-04-19 15:58:04 +08:00
7827d385bd refactor 2020-06-16 14:45:36 +08:00
ade0670415 update readme 2020-01-17 11:01:13 +08:00
63aec616b1 code optimize 2020-01-17 09:38:40 +08:00
378883b626 check url change before extract data 2020-01-16 15:11:49 +08:00
c78f593c70 code optimize 2020-01-16 09:59:19 +08:00
d82010686d Extractor.watch() improvements
- only watch current window
- stop watch on window close
- don't ask user to confirm when fails
2020-01-15 18:28:28 +08:00
7644a1363f Extractor.watch() 2020-01-15 17:53:23 +08:00
3338f78d91 code optimize 2020-01-15 15:21:17 +08:00
da7ae057f4 Extractor.stop() 2020-01-15 14:18:31 +08:00
2224db1ad1 incognito window first 2020-01-15 14:05:57 +08:00
790c95ffc3 clean state cache in 30 seconds 2020-01-14 17:03:14 +08:00
f06a6f4e78 migrate to typescript, with fixes 2020-01-14 16:37:50 +08:00
3d375261df fix task._data_keys not cleaned 2020-01-13 16:55:40 +08:00
13e233fbe7 allow user decision when some action fails 2020-01-13 16:47:52 +08:00
21d3dfb247 small fixes 2020-01-13 16:45:54 +08:00
97c8aac58d add logger 2020-01-13 14:27:40 +08:00
09112bb506 update documents 2020-01-12 16:54:24 +08:00
c7f4fe7cc4 save and load state 2020-01-12 16:19:38 +08:00
f1cf32b83a availablity check before run on active tab 2020-01-11 20:40:25 +08:00
341abebc66 scrollToBottom option 2020-01-11 20:00:52 +08:00
0cf04c3f79 keep state and continue 2020-01-11 09:02:12 +08:00
6134289d0a queryUrl expected url 2020-01-10 16:35:57 +08:00
0e62d914c1 runtime error do not interrupt promise chains 2020-01-10 16:34:46 +08:00
c504942144 wait for elements 2020-01-10 15:35:18 +08:00
4656e4ff64 helper function $ 2020-01-10 13:22:37 +08:00
26c6c1159e refactoring 2020-01-10 12:07:21 +08:00
35 changed files with 6119 additions and 660 deletions

154
.gitignore vendored
View File

@ -1,2 +1,154 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.gitignore.io/api/visualstudiocode,macos,node
# Edit at https://www.gitignore.io/?templates=visualstudiocode,macos,node
### macOS ###
# General
.DS_Store
Thumbs.db
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# rollup.js default build output
dist/
# Uncomment the public line if your project uses Gatsby
# https://nextjs.org/blog/next-9-1#public-directory-support
# https://create-react-app.dev/docs/using-the-public-folder/#docsNav
# public
# Storybook build outputs
.out
.storybook-out
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Temporary folders
tmp/
temp/
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
# End of https://www.gitignore.io/api/visualstudiocode,macos,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

4433
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

17
package.json Normal file
View File

@ -0,0 +1,17 @@
{
"name": "data-extractor",
"scripts": {
"dev": "webpack --mode=development --devtool=inline-source-map --watch",
"prod": "webpack --mode=production"
},
"devDependencies": {
"@types/chrome": "0.0.91",
"@types/node": "^13.1.6",
"copy-webpack-plugin": "^5.1.1",
"ts-loader": "^6.2.1",
"tslint": "^5.20.1",
"typescript": "^3.7.4",
"webpack": "^4.41.5",
"webpack-cli": "^3.3.10"
}
}

View File

@ -1,14 +0,0 @@
// Wire up the popup links once the document is ready.
$().ready(() => {
  // Open the given address in a new browser tab.
  const openInNewTab = (address) => {
    chrome.tabs.create({ url: address });
  };

  // Link to this extension's detail page.
  $("#link-extension-detail").on('click', () => {
    openInNewTab(`chrome://extensions/?id=${chrome.runtime.id}`);
  });

  // Link to the project documentation.
  $("#link-document").on('click', () => {
    openInNewTab(`https://git.jebbs.co/jebbs/data-extracter-extesion`);
  });
});

198
readme.md
View File

@ -5,130 +5,242 @@ DataExtracter helps you quickly extract data from any web pages.
All you need to do is:
- Find out the selectors (JQuery selectors) for target data
- Find out the selectors for target data
- Type scripts in the console of `extension background page`, as introduced below.
![](images/console.png)
![](template/assets/console.png)
## Quick Start
Extract current page
```js
new Extractor().task(".list-item", ["a.title", "p.content"]).start();
$('.item', ['a', 'a@href']);
new Extractor().task('.item', ['a', 'a@href']).start();
// fieldSelectors can be empty strings if items have no child to select
new Extractor().task('.item a', ['', '@href']).start();
```
> `$(...args)` is the short form of `new Extractor().task(...args).start();`, which is introduced later.
Extract multiple pages (1-10, interval 1)
```js
new Extractor().task(".list-item", ["a.title", "p.content"],"http://sample.com/?pn=${page}", 1, 10, 1).start();
$('.item', ['a', 'a@href'],"http://sample.com/?pn=${page}", 1, 10, 1);
```
Extract multiple urls (list)
```js
new Extractor().task(".list-item", ["a.title", "p.content"],["http://sample.com/abc","http://sample.com/xyz"]).start();
$('.item', ['a', 'a@href'],["http://sample.com/abc","http://sample.com/xyz"]);
```
Extract specified pages (1,3,5)
```js
new Extractor().task(".list-item", ["a.title", "p.content"], "http://sample.com/?pn=${page}", [1, 3, 5]).start();
$('.item', ['a', 'a@href'], "http://sample.com/?pn=${page}", [1, 3, 5]);
```
## Extractor.task() Signatures
## Task Call Signatures
```ts
// a task extracting data from current page
task(itemsSelector:string, fieldSelectors:string[])
// a task extracting data from a range of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
// a task extracting data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
// a task extracting data from a list of pages
task(itemsSelector:string, fieldSelectors:string[], urls:string[])
// a task extracting data of urls which extracted from last task result
task(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
// extract data from current page
function (itemsSelector:string, fieldSelectors:string[])
// extract data from a range of pages
function (itemsSelector:string, fieldSelectors:string[], urlTemplate:string, from:number, to:number, interval:number)
// extract data from a list of pages
function (itemsSelector:string, fieldSelectors:string[], urlTemplate:string, pages:number[])
// extract data from a list of pages
function (itemsSelector:string, fieldSelectors:string[], urls:string[])
// extract data of urls which extracted from last task result
function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
```
## Advanced Usage
## Stop Tasks
### Stop Tasks
Close the target tab in which the current tasks are running.
Tasks wait for their target elements' appearance, given some elements were loaded asynchronously.
Or use `job.stop()`:
But if you typed wrong selectors, the task waits forever for elements which don't exist.
```js
job = new Extractor().task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
job.stop();
```
The only way to stop tasks before its finish, is `Closing the host tab`.
> Next time you call `job.start();`, the job will continue from where it stopped.
### Extract Attributes.
## Extract Attributes
e.g.: link text and target (use 'selector@attribute')
```js
new Extractor().task('.list-item', ['a.title', 'a.title@href']).start();
new Extractor().task('.item', ['a', 'a@href']).start();
```
## Click Selected Elements
The following clicks selected links and extracts link `text` and `href`
```js
new Extractor().task('.item', ['!a', 'a@href']).start();
```
## Advanced Usage
### Use Task Chain.
e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link
```js
new Extractor()
.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
e = new Extractor()
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
### Save Result of Any Task
### Extractor Options
To a multiple task (chain) Extractor `e`:
Specify extra options, to make task do some actions before scrape the data.
```js
var job = new Extractor({ "scrollToBottom": 1 });
```
Available options:
- `scrollToBottom`: Try to scroll pages to the bottom; some elements are loaded only when the user needs them.
### Export Result of Any Task
To a multiple task Extractor `e`:
```js
e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
User will be asked to save the final result when it finishes.
User will be asked to export the final result when it finishes.
Incase you want to save it again, use:
In case you want to export it again, use:
```js
e.save()
e.export()
```
You may want to save another task's result, other than the final:
To export another task result, other than the final one:
```js
// save the result of first task
// export the result of first task
// to the example above, that is a list of urls
e.save(1)
e.export(0)
// export the result of second task
e.export(1)
```
## Task Management
### Continue Tasks
Sometimes it's hard to finish them in a single execution; that's why we need "Continuing of Tasks".
You can always continue tasks by starting them again, no matter in what phase they stopped.
```js
e.start()
```
The `Extractor` kept the execution state, and starts from where it stopped.
### Restart Tasks
In case some later task fails, you don't need to restart all tasks.
What if I don't like to continue from last state, but restart certain tasks?
Here we have 2 tasks:
```js
// restart all tasks
e.restart(0)
// restart from 2nd task
e.restart(1)
```
### Save & Load State
It may also be hard to finish tasks in even a single day, so we need a way to save the current state and come back tomorrow.
Create and run an extractor:
```js
e = new Extractor()
e.task('.search-list-item', ['.item a@href'], ["http://sample.com/abc"])
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"])
.start();
```
Suppose the second task fails, we can restart and continue from the task 2:
Save the state:
```js
e.restart(2);
e.save();
```
If you'd like restart all task, use:
Load the state:
Open the popup window and upload the saved state file. Then, in the background console:
```js
e = new Extractor().load();
e.start();
// or
e.restart();
```
> The uploaded state will be cleaned in 30 seconds, if you don't load it.
## Watch Mode
Watch mode tries to extract data from every page you visit **in current window**.
```js
e = new Extractor();
e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"])
.task('list-item', ["a.title", "p.content"]);
e.watch(1); // start watching for first task
```
To stop watching, you can either `close current window`, or:
```js
e.stop();
```
## Results Operation
To get the results of a task:
```js
let results = job.results(0);
```
Visit URLs (if any) in the results one by one:
```js
results.visit();
```
Walk through all results one by one:
```js
results.walk((row,col,value)=>{console.log(value)});
```
## Development
Clone this project and execute:
```sh
npm i
npm run prod
# or
npm run dev
```

View File

@ -1,14 +0,0 @@
// Handle "DataExtracter:Extract" requests: validate the call arguments, then run the extraction.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  if (message.from !== "DataExtracter:Extract") return;

  // Invalid arguments: answer with the usage text instead of running anything.
  const argsAreValid = testArgs(...message.args);
  if (!argsAreValid) {
    sendResponse(signitures);
    return;
  }

  // Surface failures both in the console and to the user.
  const reportFailure = (err) => {
    console.log(err);
    alert(err);
  };
  extract(...message.args).catch(reportFailure);
});

View File

@ -1,58 +0,0 @@
/**
 * Forward an extract request to the extension runtime.
 * Any truthy response is printed to the console.
 * @param {...any} args arguments forwarded unchanged in the message
 */
function extract(...args) {
  const printReply = (reply) => {
    if (reply) console.log(reply);
  };
  chrome.runtime.sendMessage({ from: "DataExtracter:Extract", args }, printReply);
}
// Content-script dispatcher for "DataExtracter:<action>" requests.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
  if (!request.from) return;
  const [prefix, action] = request.from.split(":");
  if (prefix.toLowerCase() !== 'dataextracter') return;
  // console.log(request);

  // Reply only when a response callback was supplied.
  const reply = (payload) => {
    if (sendResponse) sendResponse(payload);
  };

  const command = action.toLowerCase();
  if (command === "extract") {
    reply(extractTabData(request.itemsSelector, request.fieldSelectors));
  } else if (command === "gotourl") {
    window.location.replace(request.url);
    reply(request.url);
  } else if (command === "reportin") {
    reply(request.from);
  } else if (command === "queryurl") {
    reply(window.location.href);
  }
});
/**
 * Collect one row per item matched by itemsSelector and one cell per field selector.
 * A field selector may carry an "@property" suffix; without it the trimmed text content is used.
 * If any field matches nothing under an item, the whole result is discarded and [] is returned.
 * @param {string} itemsSelector selector for the items (rows)
 * @param {Array<string>} fieldSelectors selectors for the fields (columns) under each item
 * @returns {Array<Array<string>>} the extracted rows, or [] when a field was not found
 */
function extractTabData(itemsSelector, fieldSelectors) {
  let missingField = false;

  const readField = (item, selector) => {
    const [cls, attr] = selector.split('@').slice(0, 2);
    // TODO: close tab to cancel task tip
    if (missingField) return undefined;
    const matches = $(item).find(cls).toArray();
    if (matches.length === 0) {
      missingField = true;
      return undefined;
    }
    const values = matches.map(el => (attr ? el[attr] : el.textContent.trim()));
    return values.join('\n');
  };

  const rows = $(itemsSelector)
    .toArray()
    .map(item => fieldSelectors.map(selector => readField(item, selector)));

  return missingField ? [] : rows;
}

View File

@ -1,221 +0,0 @@
/**
 * Extract data from the current tab or from multiple urls, then offer the result for download.
 * The user is shown a sample of the data and the file is saved only after confirmation.
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @param {...any} args url list / url templates, page numbers, either [from, to, interval] or [...pages]
 */
async function extract(itemsSelector, fieldSelectors, ...args) {
  const result = await getData(itemsSelector, fieldSelectors, ...args);
  const itemCount = result.data.length;
  const sample = result.toString(50) || "- Empty -";
  const question = `Click confirm to download if the sample data looks good (${itemCount} items)\n\n${sample}`;
  if (!confirm(question)) return;
  saveFile(result, "text/csv");
}
/**
 * Extract data from current page / multiple urls.
 * getData(tab, itemsSelector:string, fieldSelectors:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(tab, itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(tab, itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 * getData(itemsSelector:string, fieldSelectors:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
 * getData(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:string[])
 * getData(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
 * @param {...any} args optional target tab, followed by one of the argument lists above
 * @returns {Promise<ExtractResult>} the rows collected from every visited page, in visiting order
 * @throws {Error} when the arguments are invalid or no active tab can be found
 */
async function getData(...args) {
  let tab;
  // A leading non-string argument is the target tab.
  if (typeof args[0] !== 'string') tab = args.shift();
  if (!testArgs(...args))
    throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
  // Keep the selectors local: assigning undeclared names would leak globals
  // shared (and overwritten) by every concurrent call.
  const [itemsSelector, fieldSelectors, ...target] = args;
  const urls = parseUrls(...target);
  if (!tab) tab = await getActiveTab(true) || await getActiveTab(false);
  if (!tab) throw new Error("Cannot find active tab.");

  const data = [];
  if (urls.length) {
    // Visit the urls strictly one after another; the first failure aborts the run.
    for (const url of urls) {
      await redirectTab(tab, url);
      data.push(...await extractTabData(tab, itemsSelector, fieldSelectors));
    }
  } else {
    data.push(...await extractTabData(tab, itemsSelector, fieldSelectors));
  }
  return new ExtractResult(data);
}
/**
 * Turn the "target" part of a task's arguments into a flat list of urls.
 * Accepted forms:
 *   parseUrls()                                   -> []
 *   parseUrls(urls:string[])                      -> urls
 *   parseUrls(urls:ExtractResult)                 -> all non-empty cells of the result
 *   parseUrls(template:string, pages:number[])    -> template with "${page}" replaced per page
 *   parseUrls(template:string, from, to, interval)-> template with "${page}" replaced for from..to
 * @param {...any} args target arguments as described above
 * @returns {Array<string>} the urls to visit
 * @throws {RangeError} when a page range is given with an interval that is not a positive number
 */
function parseUrls(...args) {
  if (!args.length) return [];
  const [target, ...rest] = args;
  if (target instanceof Array) return target;
  if (target instanceof ExtractResult) return target.squash().filter(v => !!v);
  if (!target) return [];

  const urlTempl = target;
  if (rest[0] instanceof Array) {
    return rest[0].map(p => urlTempl.replace("${page}", p));
  }
  if (rest.length >= 3) {
    // Numeric strings pass argument validation; coerce them so that
    // "i += interval" adds numbers instead of concatenating strings.
    const from = Number(rest[0]);
    const to = Number(rest[1]);
    const interval = Number(rest[2]);
    // A zero, negative or NaN interval would never reach "to".
    if (!(interval > 0)) {
      throw new RangeError(`Invalid page interval: ${rest[2]}. It must be a positive number.`);
    }
    const urls = [];
    for (let i = from; i <= to; i += interval) {
      urls.push(urlTempl.replace("${page}", i));
    }
    return urls;
  }
  return [];
}
/**
 * Navigate the tab to url (unless it is already there) and wait for its url to change.
 * @param {any} tab target tab
 * @param {string} url url to navigate to
 * @returns {Promise<string>} a promise of the tab url reported after the redirect
 */
async function redirectTab(tab, url) {
  const currentUrl = await queryUrl(tab, undefined, 'Query current url...');
  let previousUrl = "";
  if (url !== currentUrl) {
    previousUrl = currentUrl;
    // Not awaited: completion is detected by the url check below.
    sendMessage(tab, { from: "GotoUrl", url: url }, `Goto url: ${url}`);
  }
  // With a previous url, this resolves only once the tab reports a different url.
  return queryUrl(tab, previousUrl, 'Check if tab url matches expected...');
}
/**
 * Ask the target tab to extract data, retrying until a non-empty result comes back.
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @returns {Promise<string[]>} a promise of extracted data
 */
function extractTabData(tab, itemsSelector, fieldSelectors) {
  // Only a non-empty response counts as success.
  const hasRows = r => r && r.length;
  return sendMessage(
    tab,
    { from: "Extract", itemsSelector, fieldSelectors },
    'Extract data from the tab...',
    hasRows
  );
}
/**
 * Ask the target tab to report in; usually used to detect whether the content script is ready.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the report in message
 */
function reportIn(tab) {
  const request = { from: "ReportIn" };
  // Compare against the live request: the tab is expected to echo request.from back.
  const echoed = reply => reply == request.from;
  return sendMessage(tab, request, 'Check tab availability...', echoed);
}
/**
 * Get the url of the target tab.
 * @param {any} tab target tab
 * @param {string} urlExcluded if specified, queryUrl resolves only when response not equals to urlExcluded
 * @param {string} log message passed on to sendMessage for console logging
 * @returns {Promise<string>} a promise of the url
 */
function queryUrl(tab, urlExcluded, log) {
  // Accept any non-empty url, unless it is the excluded one.
  const accept = url => url && (!urlExcluded || urlExcluded != url);
  return sendMessage(tab, { from: "QueryUrl" }, log, accept);
}
/**
 * Repeatedly send a message to the target tab until the response is accepted.
 * The request's "from" field is prefixed with "DataExtracter:" in place.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log message logged to console on every attempt.
 * @param {function} cond success condition function, r:any=>boolean; any response is accepted when omitted
 * @param {number} interval delay between attempts in milliseconds, 500 when omitted
 * @return {Promise} a promise of the response; rejects when the target tab no longer exists.
 */
function sendMessage(tab, req, log, cond, interval) {
  req.from = "DataExtracter:" + req.from;
  const retryDelay = interval || 500;
  return new Promise((resolve, reject) => {
    // Decide whether the response is good enough, otherwise schedule another attempt.
    const handleResponse = (response) => {
      const accepted = !cond || cond(response);
      if (log) console.log(log, accepted ? '(OK)' : '(failed)');
      if (accepted) {
        resolve(response);
        return;
      }
      setTimeout(attempt, retryDelay);
    };

    // One round trip: make sure the tab still exists, then send the request.
    const attempt = async () => {
      // console.log("request for", req.from);
      const liveTab = await getTabByID(tab.id);
      if (!liveTab) {
        reject("Task interrupted due to the target tab is closed.");
        return;
      }
      chrome.tabs.sendMessage(tab.id, req, handleResponse);
    };

    attempt();
  });
}
/**
 * Open a new tab.
 * @param {string} url url to load in the new tab
 * @param {boolean} active whether the new tab becomes the active one
 * @returns {Promise<any>} a promise of the created tab
 */
async function createTab(url, active) {
  const options = { url, active };
  return new Promise((resolve) => {
    chrome.tabs.create(options, (tab) => resolve(tab));
  });
}
/**
 * Find the active tab.
 * @param {boolean} currentWindow passed as the "currentWindow" filter of the tab query
 * @returns {Promise<any>} a promise of the first matching tab, undefined when none matches
 */
async function getActiveTab(currentWindow) {
  const filter = { active: true, currentWindow };
  return new Promise((resolve) => {
    chrome.tabs.query(filter, (tabs) => {
      const [first] = tabs;
      resolve(first);
    });
  });
}
/**
 * Look up a tab by its id.
 * @param {number} id tab id
 * @returns {Promise<any>} a promise of whatever chrome.tabs.get passes to its callback
 */
async function getTabByID(id) {
  return new Promise((resolve) => {
    chrome.tabs.get(id, (tab) => {
      // Read lastError on purpose; its value is not used.
      void chrome.runtime.lastError;
      resolve(tab);
    });
  });
}

View File

@ -1,143 +0,0 @@
/**
 * Runs a chain of extraction tasks in a browser tab and keeps each task's result,
 * so that a later task can use the previous result as its url list and results can be saved again.
 */
class Extractor {
  constructor() {
    this._tasks = [];
    this._tab = undefined;
    this._running = false;
    // Task results, looked up by the task's argument list.
    this._results = {};
  }
  /**
   * Add a task to Extractor. \n
   * One Extractor could have multiple tasks, which are organized in a task chain.
   * Later task will use previous task result as input (target url list).
   * So only the first task can have target url arguments, while later tasks can't.
   * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
   * @returns {Extractor|undefined} this for chaining, undefined when the arguments are invalid
   */
  task(...args) {
    if (!testArgs(...args)) {
      console.log(`Invalid call arguments.\n\n${argsToString(...args)}\n${signitures}\n`);
      // break call chain to avoid unexpected task running
      return undefined;
    }
    // given >2 arguments means the task specifies target page,
    // so it won't accept last task result as url list.
    // in this case, former tasks are useless, can be cleared.
    if (args.length > 2) this.clear();
    this._tasks.push(args);
    return this;
  }
  /**
   * Clear tasks and caches.
   */
  clear() {
    this._tasks = [];
    // Same shape as in the constructor: results are looked up by key, not by position.
    this._results = {};
  }
  /**
   * Start the task chain.
   */
  async start() {
    if (this._running) {
      console.log('The Extractor is running. Please wait..');
      return;
    }
    if (!this._tasks.length) {
      console.log('No task to run.');
      return;
    }
    let firstTaskArgs = this._tasks[0];
    if (firstTaskArgs.length > 2) {
      // task specifies target urls, create new tab with first url for it
      let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length));
      this._tab = await createTab(urls[0], false);
    } else {
      this._tab = await getActiveTab(false);
    }
    this._running = true;
    return this._tasks.reduce((pms, args, i, tasks) => {
      return pms.then(
        result => {
          // the first task has no previous result to consume
          if (result === undefined) return getData(this._tab, ...args);
          this._results[tasks[i - 1]] = result;
          return getData(this._tab, ...args, result);
        });
    }, Promise.resolve(undefined)).then(
      result => {
        this._results[this._tasks[this._tasks.length - 1]] = result;
        this._running = false;
        this.save();
      }
    ).catch(err => {
      this._running = false;
      console.log(err)
    });
  }
  /**
   * restart from specified task, but don't restart the previous tasks.
   * @param {number} taskid from which restart the tasks (1-based, defaults to 1)
   */
  async restart(taskid) {
    if (this._running) {
      console.log('The Extractor is running. Please wait..');
      return;
    }
    taskid = this._checkTaskId(taskid, 1);
    if (!taskid) return;
    // restarting from the first task is a plain start; hand its promise to the caller
    if (taskid == 1) return this.start();
    let cache = this._results[this._tasks[taskid - 2]];
    if (!cache) {
      console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`);
      return;
    }
    this._running = true;
    this._tab = await createTab(parseUrls(cache)[0], false)
    return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => {
      return pms.then(
        result => {
          // at i == 0 the incoming result is the cached one, which is already stored
          if (i > 0) this._results[tasks[i - 1]] = result;
          return getData(this._tab, ...args, result);
        });
    }, Promise.resolve(cache)).then(
      result => {
        this._results[this._tasks[this._tasks.length - 1]] = result;
        this._running = false;
        this.save();
      }
    ).catch(err => {
      this._running = false;
      console.log(err)
    });
  }
  /**
   * Save result of a task
   * @param {number} taskid which task id to save (1-based, defaults to the last task).
   */
  save(taskid) {
    taskid = this._checkTaskId(taskid, this._tasks.length);
    if (!taskid) return;
    const result = this._results[this._tasks[taskid - 1]];
    if (!result) {
      console.log(`No task result for id (${taskid}). Forget to call ".start()"?`);
      return;
    }
    if (confirm(
      `Click confirm to download if the sample data looks good (${result.data.length} items)\n\n${result.toString(50) || "- Empty -"}`
    )) {
      saveFile(result, "text/csv");
    }
  }
  /**
   * Validate a 1-based task id.
   * @param {number} id task id to check; undefined or null selects defaultId
   * @param {number} defaultId id used when none is given
   * @returns {number} the valid id, or 0 when there are no tasks or the id is out of range
   */
  _checkTaskId(id, defaultId) {
    if (!this._tasks.length) {
      console.log("No task found.");
      return 0;
    }
    // a missing id (undefined or null) falls back to the default
    if (defaultId && (id === undefined || id === null)) id = defaultId;
    if (isNaN(id) || id < 1 || id > this._tasks.length) {
      console.log(`Invalid task id. Rang(1-${this._tasks.length})`);
      return 0;
    }
    return id
  }
}

File diff suppressed because one or more lines are too long

View File

@ -1,34 +0,0 @@
/**
 * A table of extracted data: an array of rows, each row an array of cells.
 */
class ExtractResult {
  /**
   * @param {Array<Array<any>>} data rows of cells; an empty table when omitted
   */
  constructor(data) {
    this._data = data || [];
  }
  /**
   * @param {number} index row index
   * @returns {Array<any>} the row at index
   */
  row(index) {
    return this._data[index];
  }
  /**
   * @param {number} index column index
   * @returns {Array<any>} the cell at index of every row
   */
  column(index) {
    return this._data.map(cells => cells[index]);
  }
  /**
   * @returns {Array<any>} all cells of all rows in one flat list
   */
  squash() {
    return this._data.reduce((p, c) => p.concat(c), []);
  }
  get data() {
    return this._data;
  }
  /**
   * Render the table as CSV: every cell trimmed and quoted, one line per row.
   * @param {number} rowsCount when > 0, only the first rowsCount rows are rendered
   * @returns {string} the CSV text
   */
  toString(rowsCount) {
    const rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
    // Cells read from element properties are not always strings (numbers,
    // null, undefined), so coerce before trimming instead of throwing.
    const quote = cell => {
      const text = cell === undefined || cell === null ? "" : String(cell);
      return '"' + text.trim().replace(/"/g, '""') + '"';
    };
    return rows.map(cells => cells.map(quote).join(",") + "\n").join("");
  }
}

View File

@ -1,99 +0,0 @@
// Usage text listing the accepted Extractor.task() argument lists.
// It is a runtime string: other code in this project embeds it in messages, so its text must not be edited casually.
const signitures = `
## Usage
new Extractor().task(...args).task(...args).start();
## Extractor.task() Signitures:
----------------------------
task(itemsSelector:string, fieldSelectors:string[])
task(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number)
task(itemsSelector:string, fieldSelectors:string, url:string, pages:number[])
task(itemsSelector:string, fieldSelectors:string[], urls:string[])
task(itemsSelector:string, fieldSelectors:string[], urls:ExtractResult)
## See Detailed Help:
https://git.jebbs.co/jebbs/data-extracter-extesion
`.trim();
/**
 * Offer data to the user as a file download.
 * @param {any} data content of the file, handed to Blob / BlobBuilder as is
 * @param {string} mimeType mime type of the file
 * @param {string} fileName download name; falls back to the document title, then "result"
 */
function saveFile(data, mimeType, fileName) {
  const targetName = fileName || document.title || "result";

  // Build the blob, using the legacy BlobBuilder when the Blob constructor is missing.
  let blob;
  if (typeof window.Blob == "function") {
    blob = new Blob([data], { type: mimeType });
  } else {
    const LegacyBuilder = window.BlobBuilder || window.MozBlobBuilder || window.WebKitBlobBuilder || window.MSBlobBuilder;
    const builder = new LegacyBuilder();
    builder.append(data);
    blob = builder.getBlob(mimeType);
  }

  const urlApi = window.URL || window.webkitURL;
  const objectUrl = urlApi.createObjectURL(blob);
  const anchor = document.createElement("a");

  // Without support for the "download" attribute, fall back to msSaveBlob or plain navigation.
  if (!('download' in anchor)) {
    if (navigator.msSaveBlob) {
      navigator.msSaveBlob(blob, targetName);
    } else {
      location.href = objectUrl;
    }
    return;
  }

  // Click a temporary hidden link to trigger the download.
  anchor.style.visibility = "hidden";
  anchor.href = objectUrl;
  anchor.download = targetName;
  document.body.appendChild(anchor);
  const clickEvent = document.createEvent("MouseEvents");
  clickEvent.initEvent("click", true, true);
  anchor.dispatchEvent(clickEvent);
  document.body.removeChild(anchor);
}
/**
 * Check whether args form one of the accepted task argument lists:
 *   (itemsSelector:string, fieldSelectors:string[])
 *   (itemsSelector:string, fieldSelectors:string[], urls:string[] | ExtractResult)
 *   (itemsSelector:string, fieldSelectors:string[], url:string, pages:number[])
 *   (itemsSelector:string, fieldSelectors:string[], url:string, from, to, interval)
 * itemsSelector must be a non-empty string; field selectors may be empty strings.
 * @param {...any} args the arguments to validate
 * @returns {boolean} true when the arguments match one of the lists above
 */
function testArgs(...args) {
  const isArrayOf = (value, type) =>
    value instanceof Array && value.every(v => typeof v == type);

  // Fewer than two arguments can never be valid (the original "case 0, 1"
  // was a comma expression that only matched 1).
  if (args.length < 2) return false;

  const [itemsSelector, fieldSelectors, ...target] = args;
  if (typeof itemsSelector != "string" || !itemsSelector) return false;
  if (!isArrayOf(fieldSelectors, "string")) return false;

  switch (target.length) {
    case 0:
      return true;
    case 1:
      return isArrayOf(target[0], "string") || target[0] instanceof ExtractResult;
    case 2:
      return typeof target[0] == "string" && isArrayOf(target[1], "number");
    case 4:
      // Global isNaN on purpose: numeric strings stay accepted, as before.
      return typeof target[0] == "string" &&
        !isNaN(target[1]) && !isNaN(target[2]) && !isNaN(target[3]);
    default:
      return false;
  }
}
/**
 * Render call arguments for a diagnostic message.
 * Arrays are shown as "[a, b]"; every other value is converted with String(),
 * so undefined and null (typical for invalid calls) are printed instead of throwing.
 * @param {...any} args the arguments to render
 * @returns {string} the arguments, separated by ", "
 */
function argsToString(...args) {
  return args
    .map(v => (v instanceof Array ? `[${v.join(', ')}]` : String(v)))
    .join(', ');
}

202
src/background/actions.ts Normal file
View File

@ -0,0 +1,202 @@
import { Actions, Request } from "../common";
import { sendMessage, ResponseChecker } from "./messaging";
import { logger } from "../common/logger";
/**
 * Redirect the tab to url, unless queryUrl reports that it is already there.
 * When the tab is already at url, no request is sent and the returned promise
 * resolves to undefined; otherwise it resolves to whatever sendMessage resolves to.
 * @param {any} tab target tab
 * @param {string} url target URL
 * @param {boolean} check when truthy, a checker is passed to sendMessage that re-queries
 * the tab URL and asks the user what to do if it does not equal url
 * @returns {Promise<string>} a promise of the sendMessage result (undefined when no redirect was needed)
 */
export function redirectTab(tab: chrome.tabs.Tab, url: string, check?: boolean) {
return queryUrl(tab).then(u => {
if (url !== u) {
let req: Request = {
action: Actions.GOTO_URL,
url: url
}
// Without `check` no checker is supplied at all.
let checker: ResponseChecker<string> = !check ? undefined : async (r, err, tryCount): Promise<string> => {
let queryErr: any;
// A failed URL query is captured here and re-thrown just below.
let newURL = await queryUrl(tab).catch(e => queryErr = e);
if (queryErr) {
throw queryErr;
}
if (newURL == url) return url;
// URL mismatch: OK accepts the actual URL, Cancel returns undefined.
// NOTE(review): per the dialog text, undefined presumably makes sendMessage retry — confirm in messaging.ts.
if (
confirm(`Cannot navigate to target url.
expected: ${url}\n
actual: ${newURL}\n
Press OK to continue, Cancel to retry. Close the tab to stop`)
) {
return newURL;
}
return undefined;
}
return sendMessage<string>(tab, req, `Goto url: ${url}`, checker);
}
});
}
/**
 * Extract data from the target tab.
 * @param {any} tab target tab
 * @param {string} itemsSelector items selectors for selecting items (data rows)
 * @param {Array<string>} fieldSelectors fields selectors for selecting fields (data columns) under each item
 * @param {string} expectedURL sent along as the request's `url` field
 * @param {boolean} askOnfail when truthy, the user is asked (on every 20th try without data)
 * whether to continue to the next page; when falsy, that try gives up without asking
 * @returns {Promise<string[][]>} a promise of extracted data
 */
export function extractTabData(tab: chrome.tabs.Tab, itemsSelector: string, fieldSelectors: string[], expectedURL?: string, askOnfail?: boolean) {
let req: Request = {
action: Actions.EXTRACT,
itemsSelector: itemsSelector,
fieldSelectors: fieldSelectors,
url: expectedURL,
}
let checker: ResponseChecker<string[][]> = (response, err, tryCount) => {
// An error reported in the response aborts the extraction.
if (response.error) throw response.error;
let result = response.result;
if (!result || !result.length) {
// No data yet. Only every 20th try may give up with an empty result;
// with askOnfail the user must also confirm.
if (
tryCount % 20 == 0 && (
!askOnfail ||
confirm('No data found in current page. \n\nContinue to next page?')
)
) {
logger.warn(`Failed after ${tryCount} tries: ${tab.url}`)
return [];
} else {
// NOTE(review): undefined presumably tells sendMessage to try again — confirm in messaging.ts.
return undefined;
}
}
return result;
};
return sendMessage<string[][]>(tab, req, 'Extract data from the tab...', checker);
}
/**
 * Ping the target tab, usually used to detect if the content script is ready.
 * A failing sendMessage is treated as "no pong" rather than as an error.
 * @param {any} tab target tab
 * @param {number} count forwarded to sendMessage as its last argument (defaults to 1)
 * @returns {Promise<boolean>} a promise of boolean value indicates if ping success
 */
export async function ping(tab, count = 1) {
    const request = {
        action: Actions.PING
    };
    const expectPong: ResponseChecker<string> = (r, e, c) => {
        if (r.result == "pong") return r.result;
        return undefined;
    };
    return sendMessage<string>(tab, request, 'Check tab availability...', expectPong, 1000, 1000, count)
        .then(reply => reply == "pong", () => false);
}
/**
 * Ask the target tab for its current url.
 * @param {any} tab target tab
 * @returns {Promise<string>} a promise of the url
 */
export function queryUrl(tab: chrome.tabs.Tab) {
    const request = { action: Actions.QUERY_URL };
    return sendMessage<string>(tab, request);
}
/**
 * Send a SCROLL_BOTTOM request to the target tab.
 * @param {any} tab target tab
 * @returns the promise returned by sendMessage for this request
 */
export function scrollToBottom(tab: chrome.tabs.Tab) {
let req = {
action: Actions.SCROLL_BOTTOM
}
return sendMessage(tab, req, 'Scroll to page bottom...');
}
/**
 * Create a new tab, preferring an open incognito window when there is one.
 * @param {string} url url to open in the new tab
 * @param {boolean} active whether the new tab becomes the active tab
 * @returns {Promise<chrome.tabs.Tab>} a promise of the created tab
 */
export async function createTab(url: string, active: boolean): Promise<chrome.tabs.Tab> {
    // Awaiting here (instead of chaining inside a Promise executor) lets a
    // failure of the window lookup reject this promise rather than leave it pending.
    const incognitoWindow = await findIncognitoWindow();
    return new Promise<chrome.tabs.Tab>(resolve => {
        chrome.tabs.create({
            'url': url,
            'active': active,
            // createTab to incognito window first
            'windowId': incognitoWindow ? incognitoWindow.id : undefined
        }, tab => resolve(tab));
    });
}
/**
 * Look for an incognito window among the 'normal' type windows.
 * @returns a promise of the first incognito window, or undefined when none is found
 */
export async function findIncognitoWindow(): Promise<chrome.windows.Window> {
    return new Promise(resolve => {
        chrome.windows.getAll(
            { windowTypes: ['normal'] },
            (candidates: chrome.windows.Window[]) => {
                let match: chrome.windows.Window = undefined;
                for (const candidate of candidates) {
                    if (candidate.incognito) {
                        match = candidate;
                        break;
                    }
                }
                resolve(match);
            }
        );
    });
}
/**
 * Promise wrapper around chrome.windows.getCurrent.
 * @returns a promise of whatever window the callback receives
 */
export async function getCurrentWindow(): Promise<chrome.windows.Window> {
    return new Promise(resolve => {
        chrome.windows.getCurrent((current: chrome.windows.Window) => {
            resolve(current);
        });
    });
}
/**
 * Promise wrapper around chrome.windows.get.
 * @param {number} id id of the window to look up
 * @returns a promise of whatever window the callback receives
 */
export async function getWindowByID(id: number) {
    return new Promise<chrome.windows.Window>(resolve => {
        chrome.windows.get(id, found => {
            // lastError is read but deliberately ignored; presumably this only
            // marks the error as checked so a missing window is not reported.
            chrome.runtime.lastError;
            resolve(found);
        });
    });
}
/**
 * Open a new incognito window.
 * @returns a promise of the window handed to the chrome.windows.create callback
 */
export async function CreateIncognitoWindow() {
    const options = <chrome.windows.CreateData>{ incognito: true };
    return new Promise(resolve => {
        chrome.windows.create(options, (created: chrome.windows.Window) => {
            resolve(created);
        });
    });
}
/**
 * Query the active tab.
 * @param {boolean} currentWindow passed through to chrome.tabs.query as `currentWindow`
 * @returns a promise of the first matching tab (undefined when the query matches nothing)
 */
export async function getActiveTab(currentWindow: boolean): Promise<chrome.tabs.Tab> {
    const query = { active: true, currentWindow: currentWindow };
    return new Promise(resolve => {
        chrome.tabs.query(query, matches => {
            resolve(matches[0]);
        });
    });
}
/**
 * Promise wrapper around chrome.tabs.get.
 * @param {number} id id of the tab to look up
 * @returns a promise of whatever tab the callback receives
 */
export async function getTabByID(id: number): Promise<chrome.tabs.Tab> {
    return new Promise(resolve => {
        chrome.tabs.get(id, found => {
            // lastError is read but deliberately ignored; presumably this only
            // marks the error as checked so a missing tab is not reported.
            chrome.runtime.lastError;
            resolve(found);
        });
    });
}

31
src/background/caches.ts Normal file
View File

@ -0,0 +1,31 @@
import { logger } from "../common/logger";
import { Actions } from "../common";
import { messageSubscribers } from "./messaging";
/**
 * Holds the most recently uploaded state (sent with an UPLOAD_STATE message)
 * until it is read once or until it expires after 30 seconds.
 */
export class Caches {
    private _state: string = "";
    // Handle of the pending expiry timer, if any.
    private _cleanTimer: ReturnType<typeof setTimeout> = undefined;
    constructor() {
        messageSubscribers.addListener(Actions.UPLOAD_STATE, (request, sender, sendResponse) => {
            sendResponse('recieved!');
            this.setState(request.fileName, request.state)
        });
    }
    /**
     * The cached state. Reading it consumes it: the cache is emptied afterwards.
     */
    get state(): string {
        let s = this._state;
        this._state = "";
        this._cancelTimer();
        return s;
    }
    /**
     * Cache a state for 30 seconds.
     * @param {string} name name of the uploaded file, used for logging only
     * @param {string} content the state content
     */
    setState(name: string, content: string) {
        // Drop the timer of a previous upload, otherwise it would wipe this
        // newer state before its own 30 seconds have passed.
        this._cancelTimer();
        this._state = content;
        logger.info(`State (${name}) recieved. To load it: some_var = new Extractor().load()`);
        // clear cache in 30 seconds
        this._cleanTimer = setTimeout(() => {
            this._cleanTimer = undefined;
            if (this._state) {
                logger.info(`Uploaded state is cleaned after 30 second.`);
                this._state = "";
            }
        }, 30000);
    }
    /** Cancel the pending expiry timer, if there is one. */
    private _cancelTimer() {
        if (this._cleanTimer !== undefined) {
            clearTimeout(this._cleanTimer);
            this._cleanTimer = undefined;
        }
    }
}
export const caches = new Caches();

184
src/background/extractor.ts Normal file
View File

@ -0,0 +1,184 @@
import { Task } from "./task";
import { parseUrls, saveFile } from "./tools";
import { createTab, getActiveTab, ping, redirectTab } from "./actions";
import { logger } from "../common/logger";
import { caches } from "./caches";
import { ExtractResult } from "./result";
/**
 * Manages a chain of extraction tasks: builds it, runs it on a tab,
 * and saves / loads / exports the results.
 */
export class Extractor {
    private _tasks: Task[] = [];
    private _running = false;
    private _options: any = {};
    constructor(options?) {
        if (options) this._options = options;
    }
    /**
     * Check whether the active tab can be contacted; logs an error when it cannot.
     * @param {number} count passed to the tab ping as its retry limit
     */
    static async ping(count: number = 1) {
        let tab = await getActiveTab(true) || await getActiveTab(false);
        // no active tab at all is reported the same way as an unreachable tab
        let succ = tab && await ping(tab, count);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
    }
    /**
     * Save current state, in case we restore it later.
     */
    save() {
        saveFile(JSON.stringify(this), 'application/json', 'state.json');
    }
    /**
     * Restore previous state by loading from saved state.
     */
    load() {
        let content = caches.state;
        if (!content) {
            logger.info('No state found. Please upload a saved state from the popup window first.');
            return;
        }
        let state = JSON.parse(content);
        this._options = state._options;
        // the constructor arguments are placeholders, Task.load() overwrites them
        this._tasks = state._tasks.map(t => new Task(this._options, 'whaterver', ['whaterver']).load(t));
        return this;
    }
    /**
     * Add a task to Extractor. \n
     * One Extractor can have multiple tasks, which are organized in a task chain.
     * If url arguments are not given within later tasks, they will use previous task result as input (target url list).
     * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls.
     */
    task(...args: any) {
        this._tasks.push(new Task(this._options, ...args));
        return this;
    }
    /**
     * Get the results of a task.
     * @param {number} id which task to read, begins with 0
     * @returns {ExtractResult} the task results, or undefined when the id is invalid
     */
    results(id?: number): ExtractResult {
        id = this._checkTaskId(id);
        if (id < 0) return;
        return this._tasks[id].results;
    }
    /**
     * Clear tasks and task caches.
     */
    clear() {
        this._tasks = [];
        return this;
    }
    /**
     * Start the task chain.
     */
    start() {
        return this._startTasks(0);
    }
    /**
     * Stop one task, or all tasks when no id is given.
     * @param {number} id which task to stop, begins with 0
     */
    stop(id?: number) {
        if (id !== undefined) {
            id = this._checkTaskId(id);
            if (id < 0) return;
            this._tasks[id].stop();
            return;
        }
        for (let i = 0; i < this._tasks.length; i++) {
            this._tasks[i].stop();
        }
    }
    /**
     * Put a task into watch mode.
     * @param {number} id which task to watch, begins with 0
     */
    watch(id: number) {
        id = this._checkTaskId(id);
        if (id < 0) return;
        this._tasks[id].watch();
    }
    /**
     * restart from specified task, but don't restart the previous tasks.
     * @param {number} from where to restart the tasks, begins with 0
     */
    restart(from: number = 0) {
        let id = this._checkTaskId(from, 0);
        if (id < 0) return;
        for (let i = id; i < this._tasks.length; i++) {
            this._tasks[i].clean();
        }
        // NOTE(review): the chain is started from task 0, not from `id`; earlier
        // tasks were not cleaned, so they skip the urls they already hold data
        // for. Confirm whether `_startTasks(id)` was intended.
        return this._startTasks(0);
    }
    /**
     * Run the tasks one after another on a single tab, then offer the export.
     * @param {number} from tasks with an index below this are skipped
     */
    async _startTasks(from: number) {
        if (this._running) {
            logger.info('The Extractor is running. Please wait..');
            return;
        }
        if (!this._tasks.length) {
            logger.info('No task to run.');
            return;
        }
        let tab;
        let task = this._tasks[0];
        if (task.urls.length) {
            // task specifies target urls, create new tab with first url for it
            tab = await createTab(task.urls[0], false);
        } else {
            tab = await getActiveTab(true) || await getActiveTab(false);
            // no active tab at all is reported the same way as an unreachable tab
            let succ = tab && await ping(tab);
            if (!succ) {
                logger.error('Cannot contact with active tab.');
                return;
            }
        }
        this._running = true;
        return this._tasks.reduce((pms, task: Task, i: number) => {
            return pms.then(
                () => {
                    if (i < from) return;
                    if (i > 0) {
                        // later tasks receive the previous task's results as input
                        let prevTask = this._tasks[i - 1];
                        return task.execute(tab, prevTask.results);
                    }
                    return task.execute(tab);
                });
        }, Promise.resolve<void>(undefined)).then(
            () => {
                this._running = false;
                this.export();
            }
        ).catch(err => {
            this._running = false;
            logger.error(err);
        });
    }
    /**
     * export result of a task to CSV
     * @param {number} taskid which task id to save, begins with 0; defaults to the last task
     */
    export(taskid?: number) {
        let id = this._checkTaskId(taskid, this._tasks.length - 1);
        if (id < 0) return;
        let results = this._tasks[id].results
        let count = results.data.length
        if (!count) {
            logger.info(`No result for task #${id}. Forget to call ".start()"?`);
            return;
        }
        results.header = this._tasks[id].fieldSelectors;
        // preview at most 50 rows in the confirm dialog
        let msg = `
Please confirm to download (${count} items)
${results.toString(50) || "- Empty -"}
`.trim();
        if (confirm(msg)) {
            saveFile(results.toString(), "text/csv");
        }
    }
    /**
     * Validate a task id.
     * @param {number} id the id to check
     * @param {number} defaultId used when id is undefined
     * @returns {number} the valid id, or -1 (after logging) when it is invalid
     */
    private _checkTaskId(id: number, defaultId?: number) {
        if (!this._tasks.length) {
            logger.info("No task found.");
            return -1;
        }
        if (!isNaN(defaultId) && id === undefined) id = defaultId;
        if (isNaN(id) || id < 0 || id >= this._tasks.length) {
            logger.info(`Invalid task id. Rang(0-${this._tasks.length - 1})`);
            return -1;
        }
        return id
    }
}

14
src/background/index.ts Normal file
View File

@ -0,0 +1,14 @@
import { Extractor } from "./extractor";
declare global {
    interface Window {
        $: (...args: any) => void;
        Extractor: any;
    }
}
// Expose the scripting entry points on the window object:
// `$(...)` builds a single-task Extractor and starts it right away,
// `Extractor` is the class itself for managed task chains.
window.Extractor = Extractor;
window.$ = (...args) => new Extractor().task(...args).start();

150
src/background/messaging.ts Normal file
View File

@ -0,0 +1,150 @@
import { Request, Actions, Response } from "../common";
import { getTabByID } from "./actions";
import { logger } from "../common/logger";
/** Checker that maps a response (plus the last runtime error and the try count) to the final value synchronously. */
export type ResponseCheckerSync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => T;
/** Same as ResponseCheckerSync, but the final value is delivered through a promise. */
export type ResponseCheckerAsync<T> = (r: Response<T>, err: chrome.runtime.LastError, count: number) => Promise<T>;
/** Either kind of checker; sendMessage retries when the final value is undefined. */
export type ResponseChecker<T> = ResponseCheckerSync<T> | ResponseCheckerAsync<T>;
/**
 * Sending a message to target tab repeatedly until the response is not undefined.
 * @param {object} tab the tab where to send the message
 * @param {object} req the request data.
 * @param {string} log messages logged to console.
 * @param {function} dataChecker (response, err, tryCount) => any.
 * Check and decide what value finally returns.
 * Return undefined to make 'sendMessage' retry.
 * Throw (or reject) to cancel this promise with that error.
 * @param {number} timeout seconds to wait for each single response, default: 10.
 * @param {number} interval retry interval, default: 500ms.
 * @param {number} limit retry limit, default: 0, no limit.
 * @return {Promise} a promise of the response.
 */
export function sendMessage<T>(
    tab: chrome.tabs.Tab,
    req: Request,
    log?: string,
    dataChecker?: ResponseChecker<T>,
    timeout?: number,
    interval?: number,
    limit?: number
) {
    timeout = timeout || 10;
    interval = interval || 500;
    limit = isNaN(limit) ? 0 : limit;
    let count = 0;
    return new Promise<T>((resolve, reject) => {
        // Set once the promise is rejected, so that a response arriving after
        // a timeout does not keep the retry loop alive.
        let failed = false;
        const fail = (reason: any) => {
            failed = true;
            reject(reason);
        };
        // An exception inside loop() (e.g. `tab` is undefined) must reject the
        // promise instead of leaving it pending forever.
        loop().catch(fail);
        async function loop() {
            logger.debug("Request for", Actions[req.action]);
            let tabAvailable = await getTabByID(tab.id);
            if (!tabAvailable) {
                fail("Task interrupted due to the target tab is closed.");
                return;
            }
            if (limit && count >= limit) {
                fail(`sendMessage loop limit ${limit} reached.`);
                return;
            }
            count++;
            // `timer` must not reuse the name `timeout`: the message reports the
            // configured seconds, and the delay honors the parameter.
            let timer = setTimeout(() => { fail(`${Actions[req.action]} requset timeout after ${timeout}s`) }, timeout * 1000);
            chrome.tabs.sendMessage(tab.id, req, async (r: Response<T>) => {
                clearTimeout(timer);
                // check error but do nothing until dataChecker.
                let err = chrome.runtime.lastError;
                // already timed out, nobody is waiting for this response
                if (failed) return;
                let [result, error] = await checkResponse(dataChecker, r, err, count);
                if (error) {
                    fail(error);
                    return;
                }
                let flag = result !== undefined;
                if (log) logger.info(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(result);
                } else {
                    setTimeout(() => {
                        logger.debug('Invalid response', r, 'retry...');
                        loop().catch(fail);
                    }, interval);
                }
            });
        }
    });
}
/**
 * Turn a raw response into a [result, error] pair, running the optional checker.
 * A pair with both members undefined tells sendMessage to retry.
 */
async function checkResponse<T>(
    dataChecker: ResponseChecker<T>,
    response: Response<T>,
    error: chrome.runtime.LastError,
    tryCount: number
): Promise<[T, string]> {
    // response could be undefined if the content script is interrupted.
    // don't check, tell sendMessage to retry.
    if (!response) return [undefined, undefined];
    // without a checker the response is passed through as it is
    if (!dataChecker) return [response.result, response.error];
    let outcome: T | Promise<T>;
    try {
        outcome = dataChecker(response, error, tryCount);
    } catch (thrown) {
        // the checker failed synchronously
        return [undefined, thrown];
    }
    // plain values need no further handling
    if (!(outcome instanceof Promise)) return [outcome, undefined];
    // promise: capture a rejection instead of letting it propagate
    let rejection: any;
    const settled = await outcome.catch(reason => rejection = reason);
    if (rejection) return [undefined, rejection];
    return [settled, undefined];
}
/** Subscriber for an incoming runtime message that handles it synchronously. */
export type ActionSubscriberSync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => void;
/** Subscriber for an incoming runtime message that handles it asynchronously. */
export type ActionSubscriberAsync = (request: Request, sender: chrome.runtime.MessageSender, sendResponse: (response?: any) => void) => Promise<void>;
/** Either kind of subscriber, as accepted by MessageSubscribers. */
export type ActionSubscriber = ActionSubscriberSync | ActionSubscriberAsync;
/**
 * Registry of subscribers for incoming messages, grouped by action.
 */
class MessageSubscribers {
    private listeners: { [key: number]: ActionSubscriber[] } = {};
    /** Register a subscriber for an action. */
    addListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) {
            this.listeners[action] = [];
        }
        this.listeners[action].push(subscriber);
    }
    /** Remove every registration of the subscriber for an action. */
    removeListener(action: Actions, subscriber: ActionSubscriber) {
        if (!this.listeners[action]) {
            this.listeners[action] = [];
        }
        const registered = this.listeners[action];
        // walk backwards so that removing in place does not shift pending indexes
        for (let idx = registered.length - 1; idx >= 0; idx--) {
            if (registered[idx] == subscriber) {
                registered.splice(idx, 1);
            }
        }
        logger.debug(`${this.listeners[action].length} subscriber(s) remained for action ${Actions[action]}`);
    }
    /** Subscribers registered for an action; undefined when the action was never touched. */
    getListeners(action: Actions): ActionSubscriber[] {
        return this.listeners[action]
    }
}
export const messageSubscribers = new MessageSubscribers();
// Dispatch every incoming runtime message to the subscribers of its action.
chrome.runtime.onMessage.addListener(function (request: Request, sender, sendResponse) {
    const targets = messageSubscribers.getListeners(request.action);
    if (!targets || !targets.length) {
        sendResponse("Request not supported.");
        return;
    }
    // call all subscribers in registration order, keeping the promises of the async ones
    const pending: Promise<any>[] = [];
    targets.forEach(target => {
        const outcome = target(request, sender, sendResponse);
        if (outcome instanceof Promise) pending.push(outcome);
    });
    return pending.length ? Promise.all(pending) : undefined;
});

85
src/background/result.ts Normal file
View File

@ -0,0 +1,85 @@
import { logger } from "../common/logger";
import { getActiveTab, ping, redirectTab } from "./actions";
import { parseUrls } from "./tools";
/**
 * A table of extracted data (rows of cells) with an optional header,
 * plus helpers to read, serialize and walk it.
 */
export class ExtractResult {
    private _header: string[];
    private _data: string[][] = [];
    constructor(data: string[][]) {
        this._data = data || [];
    }
    /** The row at the given index. */
    row(index: number): string[] {
        return this._data[index];
    }
    /** The cells at the given column index, one per row. */
    column(index: number): string[] {
        return this._data.map(cells => cells[index]);
    }
    /** All cells of all rows in one flat list. */
    squash(): string[] {
        return this._data.reduce((p, c) => p.concat(c), []);
    }
    set header(h: string[]) {
        this._header = h
    }
    get data(): string[][] {
        return this._data;
    }
    /**
     * Serialize to CSV, header first when one is set. Every cell is trimmed and quoted.
     * @param {number} rowsCount number of data rows to include, 0 (default) for all
     */
    toString(rowsCount: number = 0): string {
        let rows = rowsCount > 0 ? this._data.slice(0, rowsCount) : this._data;
        if (this._header && this._header.length) {
            // build a new list: prepending in place would insert the header into
            // the data itself, and once more on every further call
            rows = [this._header, ...rows];
        }
        return rows.map(lineCells => {
            if (!lineCells || !lineCells.length) return "\n";
            let line = lineCells.map(cell => {
                // cells come from page attributes and may not be strings
                let text = cell ? String(cell) : "";
                return '"' + text.trim().replace(/"/g, '""') + '"';
            }).join(",");
            return line + "\n";
        }).join("");
    }
    /**
     * Call fn for every cell, row by row, waiting for each call to finish
     * before the next starts. The first error stops the walk and is logged.
     */
    async walk(fn: (row: number, col: number, value: string) => void) {
        let pms = Promise.resolve(null);
        for (let i = 0; i < this._data.length; i++) {
            let cells = this._data[i];
            for (let j = 0; j < cells.length; j++) {
                let row = i;
                let col = j;
                let value = cells[j];
                pms = pms.then(
                    () => fn(row, col, value)
                )
            }
        }
        return pms.catch(err => {
            logger.error(err);
        });
    }
    /**
     * Redirect the active tab to every url found in the data, one after another.
     */
    async visit() {
        let urls = parseUrls(this);
        let tab = await getActiveTab(true) || await getActiveTab(false);
        // no active tab at all is reported the same way as an unreachable tab
        let succ = tab && await ping(tab);
        if (!succ) {
            logger.error('Cannot contact with active tab.');
            return;
        }
        return urls.reduce(
            (pms, url: string, i: number) => {
                return pms.then(
                    async () => {
                        return redirectTab(tab, url, false);
                    });
            }, Promise.resolve<void>(undefined)
        ).catch(err => {
            logger.error(err);
        });
    }
}

View File

@ -0,0 +1,71 @@
import { ExtractResult } from "./result";
// Usage text listing the supported call signatures.
export const signitures = `
## Usage
// single task
$(...args);
// managed task chains
e = new Extractor();
e.task(...args).task(...args).start();
## Task Call Signitures:
function(itemsSelector:string, fieldSelectors:string[]);
function(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number);
function(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[]);
function(itemsSelector:string, fieldSelectors:string[], urls:string[]);
## Example:
// extract all links text & url under '.item' elements
// use 'selector@attr' to get attribute of the field elements
$(".item", ["a", "a@href"]);
## See Detailed Help:
https://git.qjebbs.com/jebbs/data-extracter-extesion
`.trim();
/**
 * Test whether the arguments match one of the supported task call signatures
 * (2, 3, 4 or 6 arguments).
 * @returns a truthy value when they match, a falsy value otherwise
 */
export function testArgs(...args: any) {
    const everyValue = (arr, tester) => arr.reduce((ok, v) => ok && tester(v), true);
    const isString = v => typeof v == "string";
    const isNumber = v => typeof v == "number";
    // itemsSelector must be a non-empty string, fieldSelectors an array of strings
    const selectorsOk = () => args[0] && args[1] &&
        typeof args[0] == "string" &&
        args[1] instanceof Array &&
        everyValue(args[1], isString);
    // (itemsSelector, fieldSelectors)
    if (args.length == 2) {
        return selectorsOk();
    }
    // (itemsSelector, fieldSelectors, urls | ExtractResult)
    if (args.length == 3) {
        return selectorsOk() && (
            (args[2] instanceof Array && everyValue(args[2], isString)) ||
            (args[2] instanceof ExtractResult)
        );
    }
    // (itemsSelector, fieldSelectors, url, pages)
    if (args.length == 4) {
        return selectorsOk() &&
            typeof args[2] == "string" &&
            args[3] instanceof Array &&
            everyValue(args[3], isNumber);
    }
    // (itemsSelector, fieldSelectors, url, from, to, interval)
    if (args.length == 6) {
        return selectorsOk() &&
            typeof args[2] == "string" &&
            !isNaN(args[3]) && !isNaN(args[4]) && !isNaN(args[5]);
    }
    return false;
}

178
src/background/task.ts Normal file
View File

@ -0,0 +1,178 @@
import { parseUrls } from "./tools";
import { queryUrl, redirectTab, scrollToBottom, extractTabData, findIncognitoWindow, getCurrentWindow, getWindowByID } from "./actions";
import { testArgs, signitures } from "./signiture";
import { ExtractResult } from "./result";
import { messageSubscribers, ActionSubscriber } from "./messaging";
import { Actions } from "../common";
import { logger } from "../common/logger";
/**
 * One extraction task: the selectors to extract with, the urls to visit,
 * and the data collected so far (keyed by url).
 */
export class Task {
    private _data: { [key: string]: string[][] } = {};
    // keys of _data in the order they were first saved
    private _data_keys: string[] = [];
    private _options: any;
    private _itemsSelector: string;
    private _fieldSelectors: string[];
    private _urls: string[] = [];
    private _running = false;
    private _listeners: ActionSubscriber[] = [];
    constructor(options: any, ...arg: any);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, from: number, to: number, interval: number);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], url: string, pages: number[]);
    constructor(options: any, itemsSelector: string, fieldSelectors: string[], urls: string[]);
    constructor(options, ...args) {
        if (!testArgs(...args))
            throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`);
        this._options = options;
        this._itemsSelector = args.shift();
        this._fieldSelectors = args.shift();
        this._urls = parseUrls(...args);
    }
    /**
     * Overwrite selectors, urls and data with those of a saved state.
     */
    load(state: any): Task {
        this._data = state._data;
        this._data_keys = state._data_keys;
        this._itemsSelector = state._itemsSelector;
        this._fieldSelectors = state._fieldSelectors;
        this._urls = state._urls;
        return this;
    }
    get urls(): string[] {
        return this._urls;
    }
    /** All saved rows, concatenated in the order their urls were first saved. */
    get results(): ExtractResult {
        let rs: string[][] = this._data_keys.reduce((p, c) => {
            return p.concat(this._data[c]);
        }, []);
        return new ExtractResult(rs);
    }
    get fieldSelectors(): string[] {
        return this._fieldSelectors;
    }
    /** Stop the task and drop all collected data. */
    clean(): Task {
        this.stop();
        this._data = {};
        this._data_keys = [];
        return this;
    }
    /** Mark the task as not running and unsubscribe its watch listeners. */
    stop() {
        this._running = false;
        let listener: ActionSubscriber;
        while (listener = this._listeners.pop()) {
            messageSubscribers.removeListener(Actions.REPORT_NEW_PAGE, listener);
        }
    }
    /**
     * Extract data from every page that reports itself (REPORT_NEW_PAGE) from
     * the incognito window, or from the current window when there is none.
     * Runs until stop() is called or the watched window is closed.
     */
    async watch() {
        if (this._running) {
            logger.info("The task is running. Please wait...");
            return;
        }
        this._running = true;
        let window = await findIncognitoWindow() || await getCurrentWindow();
        if (!window) {
            // nothing is watched, so the task must not stay marked as running
            this._running = false;
            logger.info("No window to watch...");
            return;
        }
        let watchTaskID = 0;
        let listener: ActionSubscriber = async (request, sender, sendResponse) => {
            let findWindow = await getWindowByID(window.id);
            if (!findWindow) {
                // stop watch on window close: unsubscribes this listener and
                // resets the running state so the task can be used again.
                this.stop();
                return;
            }
            // only watch current window.
            if (sender.tab.windowId != window.id) return;
            let taskID = watchTaskID++;
            logger.info(`Watcher #${taskID} starts.`);
            let pm = this.makeOptionalTasks(sender.tab);
            return pm.then(
                () => extractTabData(sender.tab, this._itemsSelector, this._fieldSelectors, sender.tab.url, true)
            ).then(
                results => {
                    if (results && results.length) {
                        this.saveResult(results, sender.tab.url);
                    }
                    logger.info(`Watcher #${taskID} ends.`);
                }
            ).catch(
                e => logger.error(`Watcher #${taskID} ends with:`, e)
            )
        }
        this._listeners.push(listener);
        messageSubscribers.addListener(Actions.REPORT_NEW_PAGE, listener);
    }
    /**
     * Visit the target urls one by one in the tab and save what is extracted.
     * Without own urls, the urls come from upstreamData, or else the tab's current url is used.
     * @param {chrome.tabs.Tab} tab the tab to work in
     * @param {ExtractResult} upstreamData results of the previous task
     */
    async execute(tab: chrome.tabs.Tab, upstreamData?: ExtractResult): Promise<void> {
        if (!tab) throw "No tab to execute the task.";
        if (this._running) throw "The task is running. Please wait...";
        this._running = true;
        let urls = this._urls
        if (!urls.length) {
            if (upstreamData) {
                urls = parseUrls(upstreamData);
            } else {
                let tabURL: string;
                try {
                    tabURL = await queryUrl(tab);
                } catch (e) {
                    // without the url there is nothing to run against
                    this._running = false;
                    throw e;
                }
                urls = [tabURL];
            }
        }
        return urls.reduce((p, url, i) => p.then(
            results => {
                // `results` belongs to the previous url of the chain
                if (i > 0 && results instanceof Array) {
                    let lastURL = urls[i - 1];
                    this.saveResult(results, lastURL);
                }
                // skip urls we already have data for
                if (this._data[url]) return;
                let pms: Promise<any> = this.runningCheck(() => redirectTab(tab, url));
                return pms
                    .then(() => this.makeOptionalTasks(tab))
                    .then(
                        () => this.runningCheck(() => extractTabData(tab, this._itemsSelector, this._fieldSelectors))
                    );
            }
        ), Promise.resolve<string[][]>(null)).then(
            results => {
                if (results && results.length) {
                    let lastURL = urls[urls.length - 1];
                    this.saveResult(results, lastURL);
                }
                this._running = false;
            }
        ).catch(
            e => {
                this._running = false;
                throw e;
            }
        );
    }
    /**
     * Run the steps enabled in the options (currently only "scrollToBottom").
     * Always returns a promise, resolved right away when nothing is enabled.
     */
    private makeOptionalTasks(tab: chrome.tabs.Tab): Promise<any> {
        if (this._options["scrollToBottom"]) {
            return this.runningCheck(() => scrollToBottom(tab));
        }
        return Promise.resolve();
    }
    /** Run fn only while the task is still running, throw otherwise. */
    private runningCheck(fn: () => Promise<any>): Promise<any> {
        if (!this._running) throw "The task is stopped by user.";
        return fn();
    }
    /** Store the results under the key, remembering the order of new keys. */
    private saveResult(results, key) {
        if (this._data[key] === undefined) {
            // do not add keys again
            this._data_keys.push(key);
        }
        this._data[key] = results;
        logger.info(`${results.length} items found.`)
    }
}

62
src/background/tools.ts Normal file
View File

@ -0,0 +1,62 @@
import { ExtractResult } from "./result";
// matches text starting (after optional whitespace) with http:// or https://
const URL_REG = /^\s*(https?):\/\//im;
/**
 * Build the target url list from the url arguments of a task. Accepted forms:
 * - (urls: string[])            the list itself
 * - (result: ExtractResult)     every cell that looks like a http(s) url
 * - (template, pages: any[])    template with "${page}" replaced by each page
 * - (template, from, to, interval)  same, for from, from+interval, ... up to `to`
 * Anything else gives an empty list.
 * @throws {Error} when the range is not empty but interval is not a positive number
 */
export function parseUrls(...args): string[] {
    if (!args.length) return [];
    let arg = args.shift();
    if (arg instanceof Array) {
        return arg;
    }
    if (arg instanceof ExtractResult) {
        return arg.squash().filter(v => URL_REG.test(v));
    }
    let urlTempl = arg;
    if (!urlTempl) return [];
    if (args[0] instanceof Array) {
        return args[0].map(p => urlTempl.replace("${page}", p));
    }
    if (args.length >= 3) {
        // Coerce to numbers: numeric strings would otherwise be concatenated
        // by `+=` instead of added.
        let from = Number(args[0]);
        let to = Number(args[1]);
        let interval = Number(args[2]);
        // a zero or negative step would never reach `to`
        if (from <= to && !(interval > 0)) {
            throw new Error(`Invalid page interval: ${args[2]}. It must be a positive number.`);
        }
        let urls = [];
        for (let i = from; i <= to; i += interval) {
            urls.push(urlTempl.replace("${page}", i));
        }
        return urls;
    }
    return [];
}
/**
 * Offer the data to the user as a file download.
 * @param {string} data the file content
 * @param {string} mimeType mime type of the content
 * @param {string} fileName file name, defaults to the document title, then "result"
 */
export function saveFile(data: string, mimeType: string, fileName?: string) {
    fileName = fileName || document.title || "result";
    let blob: Blob;
    if (typeof window.Blob == "function") {
        blob = new Blob([data], {
            type: mimeType
        })
    } else {
        // legacy fallback when the Blob constructor is missing
        const BlobBuiler = window.MSBlobBuilder;
        const builer = new BlobBuiler();
        builer.append(data);
        blob = builer.getBlob(mimeType)
    }
    const urlApi = window.URL || window.webkitURL;
    const url = urlApi.createObjectURL(blob);
    const link = document.createElement("a");
    if ('download' in link) {
        link.style.visibility = "hidden";
        link.href = url;
        link.download = fileName;
        document.body.appendChild(link);
        const click = document.createEvent("MouseEvents");
        click.initEvent("click", true, true);
        link.dispatchEvent(click);
        document.body.removeChild(link)
        // Release the blob once the download had time to start; without this
        // every saved file stays in memory for the lifetime of the page.
        setTimeout(() => urlApi.revokeObjectURL(url), 40000);
    } else if (navigator.msSaveBlob) {
        navigator.msSaveBlob(blob, fileName)
        // the object url is not used on this path
        urlApi.revokeObjectURL(url);
    } else {
        // the page navigates to the url, so it must stay valid
        location.href = url
    }
}

28
src/common/index.ts Normal file
View File

@ -0,0 +1,28 @@
/**
 * Message actions exchanged between background, content and popup scripts.
 * Only EXTRACT has an explicit value; the others are numbered by their order.
 */
export enum Actions {
// from background to content script
EXTRACT = 1,
GOTO_URL,
PING,
QUERY_URL,
SCROLL_BOTTOM,
SLEEP,
WAKEUP,
// from popup to background script
UPLOAD_STATE,
// from content to background script
REPORT_NEW_PAGE,
}
/**
 * A message request. Only `action` is required.
 */
export interface Request {
action: Actions
// EXTRACT: selector of the items to extract
itemsSelector?: string
// EXTRACT: selectors of the fields under each item
fieldSelectors?: string[]
// GOTO_URL: url to go to; EXTRACT: url the page is expected to be at
url?: string
// UPLOAD_STATE: name of the uploaded file
fileName?: string
// UPLOAD_STATE: content of the uploaded file
state?: string
}
/**
 * A message response, carrying a result and an error message.
 */
export interface Response<T> {
result: T;
error: string;
}

75
src/common/logger.ts Normal file
View File

@ -0,0 +1,75 @@
/**
 * Logging levels in ascending order. Logger compares them numerically against
 * its thresholds and skips messages below them.
 */
export enum LOGGER_LEVEL {
DEBUG = 1,
INFO,
WARN,
ERROR,
DISABLED,
};
/**
 * Console logger with levels, which can also show messages at or above a
 * second threshold as a desktop notification.
 */
export class Logger {
    // id of the notification currently reused for messages, if any
    private _notificationId = undefined;
    private _log_level = LOGGER_LEVEL.INFO;
    private _notify_level = LOGGER_LEVEL.ERROR;
    /**
     * @param logLevel lowest level written to the console, default INFO
     * @param notifyLevel lowest level shown as notification, default ERROR
     */
    constructor(logLevel, notifyLevel) {
        if (logLevel) this._log_level = logLevel;
        if (notifyLevel) this._notify_level = notifyLevel;
        if (chrome.notifications) chrome.notifications.onClosed.addListener((id, byUser) => {
            // Forget the closed notification so the next message creates a new
            // one. The notify level must stay as configured: without it the
            // level check in log() lets every message through.
            if (id === this._notificationId) this._notificationId = undefined;
        });
    }
    get logLevel() {
        return this._log_level;
    }
    set logLevel(val: LOGGER_LEVEL) {
        this._log_level = val;
    }
    get notifyLevel() {
        return this._notify_level;
    }
    set notifyLevel(val: LOGGER_LEVEL) {
        this._notify_level = val;
    }
    /**
     * Write the messages with loggerFn when level reaches the log level, and
     * also notify when it reaches the notify level.
     */
    log(level: LOGGER_LEVEL, loggerFn: Function, ...msgs) {
        if (level < this._log_level) return;
        let time = new Date().toLocaleString();
        loggerFn(`${time} [${LOGGER_LEVEL[level]}]`, ...msgs);
        if (level < this._notify_level) return;
        this.notify(...msgs);
    }
    debug(...msgs) {
        this.log(LOGGER_LEVEL.DEBUG, console.debug, ...msgs);
    }
    info(...msgs) {
        this.log(LOGGER_LEVEL.INFO, console.info, ...msgs);
    }
    warn(...msgs) {
        this.log(LOGGER_LEVEL.WARN, console.warn, ...msgs);
    }
    error(...msgs) {
        this.log(LOGGER_LEVEL.ERROR, console.error, ...msgs);
    }
    /**
     * Show the messages in a notification, reusing the current one when it exists.
     */
    notify(...msgs) {
        // the notifications API is missing in some script contexts
        if (!chrome.notifications) return;
        let msg = msgs.join(' ');
        if (!this._notificationId) {
            chrome.notifications.create(
                null,
                {
                    "type": "basic",
                    "iconUrl": chrome.extension.getURL('icon.png'),
                    "title": "Data Extractor",
                    "message": msg,
                    "priority": 0,
                    "requireInteraction": true,
                },
                notificationId => {
                    this._notificationId = notificationId;
                }
            );
            return;
        }
        chrome.notifications.update(
            this._notificationId,
            { "message": msg }
        );
    }
}
export const logger = new Logger(LOGGER_LEVEL.DEBUG, LOGGER_LEVEL.DISABLED);

101
src/content/actions.ts Normal file
View File

@ -0,0 +1,101 @@
import { logger } from "../common/logger";
/**
 * Extract data from the current page.
 * @param {string} itemsSelector selector of the items (data rows)
 * @param {string[]} fieldSelectors selectors of the fields (data columns) under each item.
 * 'selector@attr' reads a property of the field elements instead of their text,
 * a leading '!' clicks the field elements, an empty selector means the item itself.
 * @param {string} expectedURL if given, the page must still be at this url
 * @returns {string[][]} the rows, or an empty list when the data is not complete yet
 * @throws a string when the page url differs from expectedURL
 */
export function extract(itemsSelector: string, fieldSelectors: string[], expectedURL: string): string[][] {
    if (expectedURL && location.href != expectedURL) {
        throw 'Target tab URL changed, aborting...';
    }
    // since some elements may be loaded asynchronously.
    // if one field is never found, we return an empty result,
    // so that senders can detect to retry until elements loaded.
    // If user writes wrong selectors, the task retries infinitely.
    let fieldFound: { [key: string]: boolean } = {};
    let items: Element[] = Array.from(document.querySelectorAll(itemsSelector));
    // items may not loaded yet, tell the sender to retry.
    if (!items.length) return [];
    let results: string[][] = items.map(
        item => {
            return fieldSelectors.map(
                rawSelector => {
                    let doClick = rawSelector.startsWith("!");
                    let selector = doClick ? rawSelector.substring(1) : rawSelector;
                    let [cls, attr] = selector.split('@').slice(0, 2);
                    let fieldElements: Element[];
                    cls = cls.trim()
                    if (cls != "") {
                        fieldElements = Array.from(item.querySelectorAll(cls));
                    } else {
                        // empty selector: the item itself is the field
                        fieldElements = [item];
                    }
                    if (!fieldElements.length) {
                        return undefined;
                    }
                    // Record under the selector exactly as given (with its '!'),
                    // because the completeness check below looks it up that way.
                    fieldFound[rawSelector] = true;
                    return fieldElements.map(find => {
                        if (doClick) {
                            let e = document.createEvent("MouseEvents");
                            e.initEvent("click", true, true);
                            find.dispatchEvent(e);
                        }
                        return attr ? find[attr] : find.textContent.trim();
                    }).join('\n')
                }
            )
        }
    );
    // TODO: configurable wait logic
    // if it exists a field, which is not found in any row, the sender should retry.
    let notFoundFields = fieldSelectors.filter(f => !fieldFound[f]);
    let shouldWait = notFoundFields.length > 0;
    if (shouldWait) {
        logger.debug('should wait for:', notFoundFields.join(','));
    }
    return shouldWait ? [] : results;
}
/**
 * Scroll the window down repeatedly until less than 20px remain below the viewport.
 * @returns {Promise<boolean>} the promise of executeUntil
 */
export function scrollToBottom() {
    const scrollDown = () => window.scrollTo(0, document.body.clientHeight);
    const isAtBottom = () => {
        const remaining = document.body.clientHeight - window.scrollY - window.innerHeight;
        return remaining < 20;
    };
    return executeUntil(scrollDown, isAtBottom, "Scroll to page bottom...", 1000, 10);
}
/**
 * Repeatedly execute a function until the detector returns true.
 * After each execution the detector is checked once `interval` ms have passed.
 * @param {object} fn the function to execute
 * @param {object} detector the detector; when missing, the first check succeeds.
 * @param {string} log messages logged to console.
 * @param {number} interval interval for detecting, default: 500ms
 * @param {number} limit max execute times of a function, 0 for no limit
 * @return {Promise} resolves true when the detector succeeds, rejects with
 * false when the limit is used up, or with the error thrown by fn / detector.
 */
function executeUntil(fn: () => void, detector: () => boolean, log: string, interval: number, limit: number) {
    interval = interval || 500;
    let count = 0;
    return new Promise<boolean>((resolve, reject) => {
        loop();
        function loop() {
            // all allowed executions are used up: give up and stop looping
            if (limit && count >= limit) {
                reject(false);
                return;
            }
            try {
                fn();
            } catch (err) {
                reject(err);
                return;
            }
            count++;
            setTimeout(() => {
                let flag: boolean;
                try {
                    flag = !detector || detector();
                } catch (err) {
                    reject(err);
                    return;
                }
                if (log) console.log(log, flag ? '(OK)' : '(failed)');
                if (flag) {
                    resolve(true);
                } else {
                    loop();
                }
            }, interval);
        }
    });
}

75
src/content/index.ts Normal file
View File

@ -0,0 +1,75 @@
import { Request, Actions, Response } from '../common';
import { scrollToBottom, extract } from './actions';
// While asleep, every request except WAKEUP is answered with undefined.
let asleep = false;
chrome.runtime.onMessage.addListener(
    (request, sender: chrome.runtime.MessageSender, sendResponse: (r: any) => void) => {
        if (!request.action) return;
        const reply = (r: any) => sendResponse && sendResponse(r);
        if (asleep && request.action != Actions.WAKEUP) {
            reply(undefined);
            return;
        }
        // console.log("Recieved request:",request);
        doAction(request, sender).then(reply);
        // return true to indicate you wish to send a response asynchronously
        return true;
    }
);
// announce this freshly loaded page
chrome.runtime.sendMessage(<Request>{ action: Actions.REPORT_NEW_PAGE });
/**
 * Perform the requested action and wrap its outcome into a Response.
 * Anything thrown by the action ends up in the `error` field.
 */
async function doAction(request: Request, sender: chrome.runtime.MessageSender): Promise<Response<any>> {
    let result: any;
    let error: string;
    try {
        switch (request.action) {
            case Actions.EXTRACT:
                result = extract(request.itemsSelector, request.fieldSelectors, request.url);
                break;
            case Actions.GOTO_URL:
                window.location.replace(request.url);
                // should not recieve any request until the page & script reload
                asleep = true;
                result = request.url;
                break;
            case Actions.PING:
                result = "pong";
                break;
            case Actions.QUERY_URL:
                result = window.location.href;
                break;
            case Actions.SCROLL_BOTTOM:
                // NOTE(review): scrollToBottom() returns a promise that is not
                // awaited here, so the response carries the promise object and
                // is sent before scrolling finishes. Confirm this is intended.
                result = scrollToBottom();
                break;
            case Actions.SLEEP:
                asleep = true;
                result = "Content script is sleeping.";
                break;
            case Actions.WAKEUP:
                asleep = false;
                result = "Content script is available.";
                break;
            default:
                error = 'Unsupported action.'
                break;
        }
    } catch (thrown) {
        // actions throw either Error objects or plain strings
        error = thrown instanceof Error ? thrown.message : thrown;
    }
    return newResponse(result, error);
}
/** Build a Response from a result and an optional error message. */
function newResponse<T>(result: T, err?: string): Response<T> {
    return { result: result, error: err };
}

34
src/popup/index.ts Normal file
View File

@ -0,0 +1,34 @@
import { Request, Actions } from '../common';
// Wire up the popup once it is loaded: two links opening tabs,
// and the file input that uploads a saved state.
window.onload = function () {
    const onClick = (selector: string, handler: () => void) => {
        document.querySelector(selector).addEventListener('click', handler);
    };
    onClick('#link-extension-detail', () => {
        chrome.tabs.create({
            'url': `chrome://extensions/?id=${chrome.runtime.id}`
        });
    });
    onClick('#link-document', () => {
        chrome.tabs.create({
            'url': `https://git.qjebbs.com/jebbs/data-extracter-extesion`
        });
    });
    document.querySelector('#state-input')
        .addEventListener('change', function () {
            // only a single selected file is handled
            if (this.files.length != 1) return;
            const file = this.files[0];
            const fileName = file.name;
            const reader = new FileReader();
            reader.onload = function (evt) {
                const fileString = evt.target.result;
                // hand the file content over with an UPLOAD_STATE message
                chrome.runtime.sendMessage(<Request>{
                    action: Actions.UPLOAD_STATE,
                    state: fileString,
                    fileName: fileName
                }, r => {
                    if (r) console.log('State sent:', r);
                });
            };
            reader.readAsText(file, "UTF-8");
        });
}

File diff suppressed because one or more lines are too long

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

@ -3,11 +3,9 @@
<link>
<meta charset="utf-8">
<title>Data Extractor</title>
<script charset="UTF-8" type="text/javascript" src="../scripts/jquery.min.js"></script>
<script charset="UTF-8" type="text/javascript" src="../styles/bootstrap.min.js"></script>
<script charset="UTF-8" type="text/javascript" src="./tip.js"></script>
<script charset="UTF-8" type="text/javascript" src="../scripts/popup.bundle.js"></script>
<link rel="stylesheet" href="../styles/bootstrap.min.css">
<link rel="stylesheet" href="../assets/bootstrap.min.css">
</head>
<body style="margin: 20px 10px;">
@ -20,20 +18,19 @@
<div class="row">
<div class="col">
<div class="alert alert-info small">
<!-- <h6>Usage:</h6> -->
<p>
Go to <a href="#" id="link-extension-detail">Extension Detail</a>, click "background page",
and type your scripts in the console.
</p>
<p>
<img src="../images/console.png" alt="" style="max-width: 489px; width: 100%; border-radius: 5px">
<img src="../assets/console.png" alt=""
style="max-width: 489px; width: 100%; border-radius: 5px">
</p>
</div>
</div>
</div>
<div class="row">
<div class="col">
<h6>Quick Start</h6>
</div>
@ -43,22 +40,33 @@
<div class="alert alert-success small">
<p>
<b>Extract current page</b>:
<br>new Extractor().task(".list-item", ["a.title", "p.content"]).start();
<br>&gt; $(".list-item", ["a.title", "p.content"]);
</p>
<p>
<b>Extract multiple pages (1-10, interval 1)</b>:
<br>new Extractor().task(".list-item", ["a.title", "p.content"],
"http://sample.com/?pn=${page}", 1, 10, 1).start();
<br>&gt; job=new Extractor().task(".list-item", ["a.title", "p.content"],
"http://sample.com/?pn=${page}", 1, 10, 1);
<br>&gt; job.start();
</p>
<p>
<b>Full document:</b>
<b>Full document at:</b>
<br>
<a href="#" id="link-document">https://git.jebbs.co/jebbs/data-extracter-extesion</a>
<a href="#" id="link-document">https://git.qjebbs.com/jebbs/data-extracter-extesion</a>
</p>
</div>
</div>
</div>
<div class="row">
<div class="col">
<h6>Saved State</h6>
</div>
</div>
<div class="row">
<div class="col">
<input type="file" name="state" id="state-input">
</div>
</div>
</div>
</body>

View File

Before

Width:  |  Height:  |  Size: 4.1 KiB

After

Width:  |  Height:  |  Size: 4.1 KiB

View File

@ -1,7 +1,7 @@
{
"manifest_version": 2,
"name": "Data Extracter",
"version": "0.1.0",
"version": "0.5.1",
"author": "jebbs",
"description": "Extract data from web page elements as sheet.",
"icons": {
@ -11,29 +11,25 @@
},
"browser_action": {
"default_icon": "icon.png",
"default_popup": "popup/tip.html",
"default_popup": "html/popup.html",
"default_title": "Data Extracter"
},
"background": {
"scripts": [
"scripts/background.js",
"scripts/result.js",
"scripts/tools.js",
"scripts/extract.js",
"scripts/extractor.js"
"scripts/background.bundle.js"
],
"persistent": false
},
"content_scripts": [{
"matches": ["*://*/*"],
"js": [
"scripts/jquery.min.js",
"scripts/content.js"
"scripts/content.bundle.js"
],
"run_at": "document_idle"
}],
"incognito": "spanning",
"permissions": [
"activeTab",
"storage"
"notifications"
]
}

12
tsconfig.json Normal file
View File

@ -0,0 +1,12 @@
{
"compilerOptions": {
"module": "commonjs",
"target": "es6",
"noImplicitAny": false,
"sourceMap": true,
"rootDir": "src",
"outDir": "dist/js",
"noEmitOnError": true,
"typeRoots": [ "node_modules/@types" ]
}
}

33
webpack.config.js Normal file
View File

@ -0,0 +1,33 @@
const path = require('path');
const CopyPlugin = require('copy-webpack-plugin');
module.exports = {
mode: 'production',
entry: {
background: './src/background/index.ts',
content: './src/content/index.ts',
popup: './src/popup/index.ts',
},
// devtool: 'inline-source-map',
output: {
path: path.resolve(__dirname, 'dist'),
filename: 'scripts/[name].bundle.js'
},
module: {
rules: [
{
test: /\.tsx?$/,
use: 'ts-loader',
exclude: /node_modules/
}
]
},
resolve: {
extensions: ['.tsx', '.ts', '.js']
},
plugins: [
new CopyPlugin([
{ from: '**/*', to: '.', toType: "dir" },
], { context: 'template', logLevel: 'warn' }),
]
};