mirror of
https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00
Updated the max crawled places option
This commit is contained in:
parent
c8a232c93f
commit
93719c924f
|
|
@ -7,7 +7,7 @@ const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelec
|
|||
.classList
|
||||
.contains('loading'), { timeout: DEFAULT_TIMEOUT });
|
||||
|
||||
const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
|
||||
const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, maxPlacesPerCrawl) => {
|
||||
let results = await page.$$('.section-result');
|
||||
const resultsCount = results.length;
|
||||
for (let resultIndex = 0; resultIndex < resultsCount; resultIndex++) {
|
||||
|
|
@ -25,6 +25,10 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
|
|||
const url = page.url();
|
||||
await requestQueue.addRequest({ url, userData: { label: 'detail' } });
|
||||
console.log(`Added to queue ${url}`);
|
||||
if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
|
||||
console.log(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
|
||||
break;
|
||||
}
|
||||
await page.click('.section-back-to-list-button');
|
||||
}
|
||||
};
|
||||
|
|
@ -36,9 +40,9 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
|
|||
* @param launchPuppeteerOptions
|
||||
* @param requestQueue
|
||||
* @param listingPagination
|
||||
* @param maxRequestsPerCrawl
|
||||
* @param maxPlacesPerCrawl
|
||||
*/
|
||||
const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => {
|
||||
const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxPlacesPerCrawl) => {
|
||||
await page.type('#searchboxinput', searchString);
|
||||
await sleep(5000);
|
||||
await page.click('#searchbox-searchbutton');
|
||||
|
|
@ -67,7 +71,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
|
|||
console.log(`Skiped pagination ${from} - ${to}, already done!`);
|
||||
} else {
|
||||
console.log(`Added links from pagination ${from} - ${to}`);
|
||||
await enqueueAllUrlsFromPagination(page, requestQueue);
|
||||
await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
|
||||
listingPagination = { from, to };
|
||||
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
|
||||
}
|
||||
|
|
@ -77,7 +81,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
|
|||
.attr('disabled');
|
||||
}, nextButtonSelector);
|
||||
const noResultsEl = await page.$('.section-no-result-title');
|
||||
if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) {
|
||||
if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
|
||||
break;
|
||||
} else {
|
||||
// NOTE: puppeteer API click() didn't work :(
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
|
|||
// Store state of listing pagination
|
||||
// NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
|
||||
const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
|
||||
await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl);
|
||||
await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxCrawledPlaces);
|
||||
listingPagination.isFinish = true;
|
||||
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user