Updated the max crawled places option

This commit is contained in:
JakubDrobnik 2018-12-10 16:13:30 +01:00
parent c8a232c93f
commit 93719c924f
2 changed files with 10 additions and 6 deletions

View File

@ -7,7 +7,7 @@ const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelec
.classList .classList
.contains('loading'), { timeout: DEFAULT_TIMEOUT }); .contains('loading'), { timeout: DEFAULT_TIMEOUT });
const enqueueAllUrlsFromPagination = async (page, requestQueue) => { const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, maxPlacesPerCrawl) => {
let results = await page.$$('.section-result'); let results = await page.$$('.section-result');
const resultsCount = results.length; const resultsCount = results.length;
for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) { for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) {
@ -25,6 +25,10 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
const url = page.url(); const url = page.url();
await requestQueue.addRequest({ url, userData: { label: 'detail' } }); await requestQueue.addRequest({ url, userData: { label: 'detail' } });
console.log(`Added to queue ${url}`); console.log(`Added to queue ${url}`);
if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
console.log(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
break;
}
await page.click('.section-back-to-list-button'); await page.click('.section-back-to-list-button');
} }
}; };
@ -36,9 +40,9 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
* @param launchPuppeteerOptions * @param launchPuppeteerOptions
* @param requestQueue * @param requestQueue
* @param listingPagination * @param listingPagination
* @param maxRequestsPerCrawl * @param maxPlacesPerCrawl
*/ */
const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => { const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxPlacesPerCrawl) => {
await page.type('#searchboxinput', searchString); await page.type('#searchboxinput', searchString);
await sleep(5000); await sleep(5000);
await page.click('#searchbox-searchbutton'); await page.click('#searchbox-searchbutton');
@ -67,7 +71,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
console.log(`Skiped pagination ${from} - ${to}, already done!`); console.log(`Skiped pagination ${from} - ${to}, already done!`);
} else { } else {
console.log(`Added links from pagination ${from} - ${to}`); console.log(`Added links from pagination ${from} - ${to}`);
await enqueueAllUrlsFromPagination(page, requestQueue); await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
listingPagination = { from, to }; listingPagination = { from, to };
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
} }
@ -77,7 +81,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
.attr('disabled'); .attr('disabled');
}, nextButtonSelector); }, nextButtonSelector);
const noResultsEl = await page.$('.section-no-result-title'); const noResultsEl = await page.$('.section-no-result-title');
if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) { if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
break; break;
} else { } else {
// NOTE: puppeteer API click() didn't work :( // NOTE: puppeteer API click() didn't work :(

View File

@ -42,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
// Store state of listing pagination // Store state of listing pagination
// NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {}; const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl); await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxCrawledPlaces);
listingPagination.isFinish = true; listingPagination.isFinish = true;
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
} else { } else {