diff --git a/manifest.json b/manifest.json index bab72c9..510af17 100755 --- a/manifest.json +++ b/manifest.json @@ -22,6 +22,7 @@ "scripts/background/result.js", "scripts/background/signiture.js", "scripts/background/actions.js", + "scripts/background/task.js", "scripts/background/extractor.js", "scripts/background/helpers.js" ], diff --git a/readme.md b/readme.md index 83aceee..ce3f782 100644 --- a/readme.md +++ b/readme.md @@ -50,17 +50,14 @@ function (itemsSelector:string, fieldSelectors:string[], urls:string[]) function (itemsSelector:string, fieldSelectors:string[], urls:ExtractResult) ``` -## Advanced Usage +## Stop Tasks -### Stop Tasks +The only way to stop tasks before its finish, is `Closing the target tab`. -Tasks wait for their target elements' appearance, given some elements were loaded asynchronously. +> Tasks wait for their target elements' appearance, given some elements were loaded asynchronously. +> If you typed wrong selectors, the task waits forever for elements which don't exists. -But if you typed wrong selectors, the task waits forever for elements which don't exists. - -The only way to stop tasks before its finish, is `Closing the host tab`. - -### Extract Attributes. +## Extract Attributes. e.g.: link text and target (use 'selector@attribute') @@ -68,20 +65,43 @@ e.g.: link text and target (use 'selector@attribute') new Extractor().task('.item', ['a', 'a@href']).start(); ``` +## Advanced Usage + ### Use Task Chain. e.g.: Collect links from `http://sample.com/abc`, then, Extract data of each link ```js -new Extractor() - .task('.search-list-item', ['a@href'], ["http://sample.com/abc"]) +e = new Extractor() +e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"]) .task('list-item', ["a.title", "p.content"]) .start(); ``` +### Continue Tasks + +You can always continue tasks (with following), even it stops in the middle of a task: + +```js +e.start() +``` + +The `Extractor` kept the state of last execution, and starts from where it stopped. + +### Restart Tasks + +What should I do, if I don't like to continue from last state, but restart from certain task? + +```js +// restart all tasks +e.restart(0) +// restart from 2nd task +e.restart(1) +``` + ### Save Result of Any Task -To a multiple task (chain) Extractor `e`: +To a multiple task Extractor `e`: ```js e = new Extractor() @@ -98,37 +118,12 @@ Incase you want to save it again, use: e.save() ``` -You may want to save another task's result, other than the final: +To save another task result, other than the final one: ```js // save the result of first task // to the example above, that is a list of urls +e.save(0) +// save the result of second task e.save(1) ``` - -### Restart Tasks - -In cases some later task fails, you don't need to restart all task. - -Here we have 2 tasks: - -```js -e = new Extractor() -e.task('.search-list-item', ['a@href'], ["http://sample.com/abc"]) - .task('list-item', ["a.title", "p.content"]) - .start(); -``` - -Suppose the second task fails, we can restart and continue from the task 2: - -```js -e.restart(2); -``` - -If you'd like restart all task, use: - -```js -e.start(); -// or -e.restart(); -``` \ No newline at end of file diff --git a/scripts/background/actions.js b/scripts/background/actions.js index 3dcbb5e..6a776f5 100644 --- a/scripts/background/actions.js +++ b/scripts/background/actions.js @@ -55,7 +55,7 @@ function parseUrls(...args) { if (arg instanceof Array) { return arg; } else if (arg instanceof ExtractResult) { - return arg.squash().filter(v => !!v); + return arg.squash().filter(v => URL_REG.test(v)); } else { let urlTempl = arg; if (urlTempl) { diff --git a/scripts/background/extractor.js b/scripts/background/extractor.js index 24ae4eb..eba40da 100644 --- a/scripts/background/extractor.js +++ b/scripts/background/extractor.js @@ -1,41 +1,44 @@ class Extractor { constructor() { this._tasks = []; - this._tab = undefined; this._running = false; - this._results = {}; } /** * Add a task to Extractor. \n * One Extractor could has multiple tasks, which orgnized in a task chian. - * Later task will use previous task result as input (target url list). - * So only the first task can have target url arguments, while later tasks can't. + * If url arguments not given within later tasks, they will use previous task result as input (target url list). * @param {...any} args itemsSelector, fieldSelectors, and more args to specify target urls. */ task(...args) { - if (!testArgs(...args)) { - console.log(`Invalid task arguments: ${argsToString(...args)}\n\n${signitures}\n`); - return this; - } - // given >2 arguments means the task specifies target page, - // so it won't accept last task result as url list. - // in this case, former tasks are useless, can be cleared. - if (args.length > 2) this.clear(); - this._tasks.push(args); + this._tasks.push(new Task(...args)); return this; } /** - * Clear tasks and caches. + * Clear tasks and task caches. */ clear() { this._tasks = []; - this._results = []; return this; } /** * Start the task chain. */ async start() { + return this._startTasks(0); + } + /** + * restart from specified task, but don't restart the previous tasks. + * @param {number} from where to restart the tasks, begins with 0 + */ + async restart(from = 0) { + let id = this._checkTaskId(from, 0); + if (!id) return; + for (let i = id; i < this._tasks.length; i++) { + this._tasks[i].clean(); + } + return this._startTasks(0); + } + async _startTasks(from) { if (this._running) { console.log('The Extractor is running. Please wait..'); return; @@ -44,68 +47,28 @@ class Extractor { console.log('No task to run.'); return; } - let firstTaskArgs = this._tasks[0]; - if (firstTaskArgs.length > 2) { + + let tab; + let task = this._tasks[0]; + if (task.urls.length) { // task specifies target urls, create new tab with first url for it - let urls = parseUrls(...firstTaskArgs.slice(2, firstTaskArgs.length)); - this._tab = await createTab(urls[0], false); + tab = await createTab(task.urls[0], false); } else { - this._tab = await getActiveTab(false); + tab = await getActiveTab(true) || await getActiveTab(false); } this._running = true; - return this._tasks.reduce((pms, args, i, tasks) => { + return this._tasks.reduce((pms, task, i) => { return pms.then( - result => { - if (result === undefined) return getData(this._tab, ...args); - this._results[tasks[i - 1]] = result; - return getData(this._tab, ...args, result); + () => { + if (i < from) return; + if (i > 0) { + let prevTask = this._tasks[i - 1]; + return task.execute(tab, new ExtractResult(prevTask.results)); + } + return task.execute(tab, undefined); }); }, Promise.resolve(undefined)).then( - result => { - this._results[this._tasks[this._tasks.length - 1]] = result; - this._running = false; - console.log("Tasks are all done.") - this.save(); - } - ).catch(err => { - this._running = false; - console.log(err) - }); - } - /** - * restart from specified task, but don't restart the previous tasks. - * @param {number} taskid from which restart the tasks - */ - async restart(taskid) { - if (this._running) { - console.log('The Extractor is running. Please wait..'); - return; - } - taskid = this._checkTaskId(taskid, 1); - if (!taskid) return; - if (taskid == 1) { - this.start(); - return; - } - let cache = this._results[this._tasks[taskid - 2]]; - if (!cache) { - console.log(`No result cache for task (id ${taskid}). \nMake sure call ".start()" before ".restart()"?`); - return; - } - this._running = true; - this._tab = await createTab(parseUrls(cache)[0], false) - return this._tasks.slice(taskid - 1).reduce((pms, args, i, tasks) => { - return pms.then( - result => { - this._results[tasks[i - 1]] = result; - return getData(this._tab, ...args, result); - }); - }, Promise.resolve(cache)).then( - result => { - this._results[this._tasks[this._tasks.length - 1]] = result; - this._running = false; - this.save(); - } + () => this.save() ).catch(err => { this._running = false; console.log(err) @@ -113,18 +76,15 @@ class Extractor { } /** * Save result of a task - * @param {number} taskid which task id to save. + * @param {number} taskid which task id to save, begins with 0 */ save(taskid) { - taskid = this._checkTaskId(taskid, this._tasks.length); - if (!taskid) return; - const result = this._results[this._tasks[taskid - 1]]; - if (!result) { - console.log(`No result for task #${taskid}. Forget to call ".start()"?`); - return; - } - if (result.data.length <= 1) { // 1 for selector headers - console.log(`No result for task #${taskid}. Forget to call ".start()"?`); + let id = this._checkTaskId(taskid, this._tasks.length - 1); + if (!id) return; + let result = new ExtractResult(this._tasks[id].results); + + if (!result.data.length) { + console.log(`No result for task #${id}. Forget to call ".start()"?`); return; } let msg = ` @@ -141,9 +101,9 @@ ${result.toString(50) || "- Empty -"} console.log("No task found."); return 0; } - if (defaultId && id === undefined || this.task === null) id = defaultId; - if (isNaN(id) || id < 1 || id > this._tasks.length) { - console.log(`Invalid task id. Rang(1-${this._tasks.length})`); + if (defaultId && id === undefined) id = defaultId; + if (isNaN(id) || id < 0 || id >= this._tasks.length) { + console.log(`Invalid task id. Rang(0-${this._tasks.length - 1})`); return 0; } return id diff --git a/scripts/background/task.js b/scripts/background/task.js new file mode 100644 index 0000000..b66feb7 --- /dev/null +++ b/scripts/background/task.js @@ -0,0 +1,66 @@ +class Task { + // _manager = undefined; + // _id = 0; + // _urls = []; + _data = {}; + /** + * Create a task. + * constructor(itemsSelector:string, fieldSelectors:string[]) + * constructor(itemsSelector:string, fieldSelectors:string[], url:string, from:number, to:number, interval:number) + * constructor(itemsSelector:string, fieldSelectors:string[], url:string, pages:number[]) + * constructor(itemsSelector:string, fieldSelectors:string[], urls:string[]) + * @param {...any} args + */ + constructor(...args) { + if (!testArgs(...args)) + throw new Error(`Invalid call arguments.\n\n${signitures}\n\n`); + this._itemsSelector = args.shift(); + this._fieldSelectors = args.shift(); + this._urls = parseUrls(...args); + } + get urls() { + return this._urls; + } + get data() { + return this._data; + } + get results() { + return this._urls.reduce((p, c) => { + return p.concat(this._data[c]); + }, []); + } + clean() { + this._data = {}; + } + async execute(tab, upstreamData) { + if (!tab) throw new Error("No tab to execute the task."); + if (!this._urls.length) { + if (upstreamData) { + this._urls = parseUrls(upstreamData); + } else { + this._urls = [await queryUrl(tab)]; + } + } + return this._urls.reduce((p, url, i) => p.then( + results => { + if (i > 0) { + if (!MSG_URL_SKIPPED.isEqual(results)) { + let lastURL = this._urls[i - 1]; + this._data[lastURL] = results; + } + } + return this._data[url] ? MSG_URL_SKIPPED : redirectTab(tab, url).then( + () => extractTabData(tab, this._itemsSelector, this._fieldSelectors) + ); + } + ), Promise.resolve(null)).then( + results => { + if (!MSG_URL_SKIPPED.isEqual(results)) { + let lastURL = this._urls[this._urls.length - 1]; + this._data[lastURL] = results; + return; + } + } + ); + } +} \ No newline at end of file diff --git a/scripts/shared/common.js b/scripts/shared/common.js index cd0b5d8..fc20a5b 100644 --- a/scripts/shared/common.js +++ b/scripts/shared/common.js @@ -1,8 +1,11 @@ const EXT_NAME = "DataExtracter"; +const URL_REG = getWebUrl(); + const ACTION_EXTRACT = `${EXT_NAME}:Extract`; const ACTION_GOTO_URL = `${EXT_NAME}:GoToTUL`; const ACTION_REPORT_IN = `${EXT_NAME}:ReportIn`; const ACTION_QUERY_URL = `${EXT_NAME}:QueryURL`; const MSG_ELEMENT_NOT_FOUND = new ConstMessage(1, "No element found for at least one selector, maybe it's not loaded yet"); +const MSG_URL_SKIPPED = new ConstMessage(100, "Skipped current URL"); diff --git a/scripts/shared/tools.js b/scripts/shared/tools.js index 30cb6eb..640fb65 100644 --- a/scripts/shared/tools.js +++ b/scripts/shared/tools.js @@ -40,3 +40,25 @@ function saveFile(data, mimeType, fileName) { location.href = url } } + +function getWebUrl() { + let engIriChar = "0-9a-zA-z"; + let goodIriChar = "0-9a-zA-z"; + let topLevelDomainStrForWebUrlExpand = "(?:com|net|org|gov|mil|edu|biz|info|pro|name|coop|travel|xxx|idv|aero|museum|mobi|asia|tel|int|post|jobs|cat|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sm|sn|so|sr|st|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|yr|za|zm|zw|accountant|club|coach|college|company|construction|consulting|contractors|cooking|corp|credit|creditcard|dance|dealer|democrat|dental|dentist|design|diamonds|direct|doctor|drive|eco|education|energy|engineer|engineering|equipment|events|exchange|expert|express|faith|farm|farmers|fashion|finance|financial|fish|fit|fitness|flights|florist|flowers|food|football|forsale|furniture|game|games|garden|gmbh|golf|health|healthcare|hockey|holdings|holiday|home|hospital|hotel|hotels|house|inc|industries|insurance|insure|investments|islam|jewelry|justforu|kid|kids|law|lawyer|legal|lighting|limited|live|llc|llp|loft|ltd|ltda|managment|marketing|media|medical|men|money|mortgage|moto|motorcycles|music|mutualfunds|ngo|partners|party|pharmacy|photo|photography|photos|physio|pizza|plumbing|press|prod|productions|radio|rehab|rent|repair|report|republican|restaurant|room|rugby|safe|sale|sarl|save|school|secure|security|services|shoes|show|soccer|spa|sport|sports|spot|srl|storage|studio|tattoo|taxi|team|tech|technology|thai|tips|tour|tours|toys|trade|trading|travelers|university|vacations|ventures|versicherung|versicherung|vet|wedding|wine|winners|work|works|yachts|zone|archi|architect|casa|contruction|estate|haus|house|immo|immobilien|lighting|loft|mls|realty|academy|arab|bible|care|catholic|charity|christmas|church|college|community|contact|degree|education|faith|foundation|gay|halal|hiv|indiands|institute|irish|islam|kiwi|latino|mba|meet|memorial|ngo|phd|prof|school|schule|science|singles|social|swiss|thai|trust|university|uno|auction|best|bid|boutique|center|cheap|compare|coupon|coupons|deal|deals|diamonds|discount|fashion|forsale|free|gift|gold|gratis|hot|jewelry|kaufen|luxe|luxury|market|moda|pay|promo|qpon|review|reviews|rocks|sale|shoes|shop|shopping|store|tienda|top|toys|watch|zero|bar|bio|cafe|catering|coffee|cooking|diet|eat|food|kitchen|menu|organic|pizza|pub|rest|restaurant|vodka|wine|abudhabi|africa|alsace|amsterdam|barcelona|bayern|berlin|boats|booking|boston|brussels|budapest|caravan|casa|catalonia|city|club|cologne|corsica|country|cruise|cruises|deal|deals|doha|dubai|durban|earth|flights|fly|fun|gent|guide|hamburg|helsinki|holiday|hotel|hoteles|hotels|ist|istanbul|joburg|koeln|land|london|madrid|map|melbourne|miami|moscow|nagoya|nrw|nyc|osaka|paris|party|persiangulf|place|quebec|reise|reisen|rio|roma|room|ruhr|saarland|stockholm|swiss|sydney|taipei|tickets|tirol|tokyo|tour|tours|town|travelers|vacations|vegas|wales|wien|world|yokohama|zuerich|art|auto|autos|baby|band|baseball|beats|beauty|beknown|bike|book|boutique|broadway|car|cars|club|coach|contact|cool|cricket|dad|dance|date|dating|design|dog|events|family|fan|fans|fashion|film|final|fishing|football|fun|furniture|futbol|gallery|game|games|garden|gay|golf|guru|hair|hiphop|hockey|home|horse|icu|joy|kid|kids|life|lifestyle|like|living|lol|makeup|meet|men|moda|moi|mom|movie|movistar|music|party|pet|pets|photo|photography|photos|pics|pictures|play|poker|rodeo|rugby|run|salon|singles|ski|skin|smile|soccer|social|song|soy|sport|sports|star|style|surf|tatoo|tennis|theater|theatre|tunes|vip|wed|wedding|win|winners|yoga|you|analytics|antivirus|app|blog|call|camera|channel|chat|click|cloud|computer|contact|data|dev|digital|direct|docs|domains|dot|download|email|foo|forum|graphics|guide|help|home|host|hosting|idn|link|lol|mail|mobile|network|online|open|page|phone|pin|search|site|software|webcam|airforce|army|black|blue|box|buzz|casa|cool|day|discover|donuts|exposed|fast|finish|fire|fyi|global|green|help|here|how|international|ira|jetzt|jot|like|live|kim|navy|new|news|next|ninja|now|one|ooo|pink|plus|red|solar|tips|today|weather|wow|wtf|xyz|abogado|adult|anquan|aquitaine|attorney|audible|autoinsurance|banque|bargains|bcn|beer|bet|bingo|blackfriday|bom|boo|bot|broker|builders|business|bzh|cab|cal|cam|camp|cancerresearch|capetown|carinsurance|casino|ceo|cfp|circle|claims|cleaning|clothing|codes|condos|connectors|courses|cpa|cymru|dds|delivery|desi|directory|diy|dvr|ecom|enterprises|esq|eus|fail|feedback|financialaid|frontdoor|fund|gal|gifts|gives|giving|glass|gop|got|gripe|grocery|group|guitars|hangout|homegoods|homes|homesense|hotels|ing|ink|juegos|kinder|kosher|kyoto|lat|lease|lgbt|liason|loan|loans|locker|lotto|love|maison|markets|matrix|meme|mov|okinawa|ong|onl|origins|parts|patch|pid|ping|porn|progressive|properties|property|protection|racing|read|realestate|realtor|recipes|rentals|sex|sexy|shopyourway|shouji|silk|solutions|stroke|study|sucks|supplies|supply|tax|tires|total|training|translations|travelersinsurcance|ventures|viajes|villas|vin|vivo|voyage|vuelos|wang|watches|测试|集团|在线|公益|公司|移动|我爱你|商标|商城|中文网|中信|中国|中國|測試|网络|香港|台湾|台灣|机构|组织机构|世界|网址|游戏|新加坡|政务|परीक्षा|한국|ভারত|موقع|বাংলা|москва|испытание|қаз|онлайн|сайт|срб|테스트|орг|삼성|சிங்கப்பூர்|дети|мкд|טעסט|భారత్|ලංකා|ભારત|भारत|آزمایشی|பரிட்சை|संगठन|укр|δοκιμή|إختبار|мон|الجزائر|عمان|ایران|امارات|بازار|پاکستان|الاردن|بھارت|المغرب|السعودية|سودان|مليسيا|شبكة|გე|ไทย|سورية|рф|تونس|みんな|ਭਾਰਤ|مصر|قطر|இலங்கை|இந்தியா|فلسطين|テスト)\\b"; // http://www.ip138.com/yuming/ + return new RegExp("((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?" + + "((?:(?:[" + engIriChar + "][" + engIriChar + "\\-]{0,64}\\.)+" // named host + + topLevelDomainStrForWebUrlExpand + + "|(?:(?:25[0-5]|2[0-4]" // or ip address + + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]" + + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]" + + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + + "|[1-9][0-9]|[0-9])))" + + "(?:\\:\\d{1,5})?)" // plus option port number + + "(\\/(?:(?:[" + goodIriChar + "\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params + + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?" + + "(?:\\b|$))", "m"); // and finally, a word boundary or end of + // input. This is to stop foo.sure from + // matching as foo.su +} \ No newline at end of file