mirror of https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00

Added max crawled places options

parent: 0689be833c
commit: e6612c9743
@@ -18,6 +18,12 @@
         "prefill": { "useApifyProxy": true },
         "editor": "proxy"
     },
+    "maxCrawledPlaces": {
+        "title": "Max crawled places",
+        "type": "integer",
+        "description": "Use this to limit how many places you want to get from the crawler. If you fill in 0 or nothing, all places will be scraped.",
+        "minimum": 0
+    },
     "lat": {
         "title": "Viewport Latitude",
         "type": "string",
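For illustration, an actor input exercising the new field might look like the sketch below. The field names come from the schema above and from the main.js hunk later in this diff; the concrete values are made up and not part of the commit.

    // Hypothetical input for a local test run; values are illustrative only.
    const input = {
        searchString: 'restaurants near Prague',
        proxyConfig: { useApifyProxy: true },
        maxCrawledPlaces: 50, // stop after roughly 50 place detail pages; 0 or omitted means no limit
        lat: '50.0755',       // the schema declares lat as a string
        lng: '14.4378',
    };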
@@ -31,14 +31,14 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {

 /**
  * Crawler adds all place details from the listing to the queue
  * @param startUrl
  * @param page
  * @param searchString
  * @param launchPuppeteerOptions
  * @param requestQueue
  * @param listingPagination
  * @param retries
  * @param maxRequestsPerCrawl
  */
-const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination) => {
+const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => {
     await page.type('#searchboxinput', searchString);
     await sleep(5000);
     await page.click('#searchbox-searchbutton');
@@ -77,7 +77,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
                 .attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
-        if (isNextPaginationDisabled || noResultsEl) {
+        if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) {
             break;
         } else {
             // NOTE: puppeteer API click() didn't work :(
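The added condition stops the listing pagination once enough requests have been enqueued. A minimal stand-alone sketch of that guard, assuming `to` (not shown in this hunk) is the index of the last result enqueued from the current listing page; the names below are illustrative, not from the repo:

    // Hypothetical version of the stop condition used in the loop above.
    const shouldStopPaginating = (nextDisabled, noResults, maxRequests, lastEnqueuedIndex) => {
        // Stop when the listing ends, there are no results, or the request limit is already exceeded.
        return Boolean(nextDisabled || noResults || (maxRequests && maxRequests < lastEnqueuedIndex));
    };

    console.log(shouldStopPaginating(false, false, 15, 20)); // true  -> limit of 15 already passed
    console.log(shouldStopPaginating(false, false, 0, 20));  // false -> 0/undefined means no limit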
@@ -3,7 +3,7 @@ const placesCrawler = require('./places_crawler');

 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');
-    const { searchString, proxyConfig, lat, lng } = input;
+    const { searchString, proxyConfig, lat, lng, maxCrawledPlaces } = input;

     if (!searchString) throw new Error('Attribute searchString missing in input.');
@@ -26,7 +26,7 @@ Apify.main(async () => {
     if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig);

     // Scrape all place detail links
-    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue);
+    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces);
     await crawler.run();

     console.log('Done!');
@@ -11,10 +11,11 @@ const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');
  * Method to set up crawler to get all place details and save them to default dataset
  * @param launchPuppeteerOptions
  * @param requestQueue
+ * @param maxCrawledPlaces
  * @return {Apify.PuppeteerCrawler}
  */
-const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
-    return new Apify.PuppeteerCrawler({
+const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => {
+    const crawlerOpts = {
         launchPuppeteerOptions,
         requestQueue,
         maxRequestRetries: MAX_PAGE_RETRIES,
@@ -22,6 +23,11 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
         handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler
         maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping
         // maxConcurrency: 1,
+    };
+    if (maxCrawledPlaces) {
+        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
+    }
+    return new Apify.PuppeteerCrawler(Object.assign(crawlerOpts, {
         gotoFunction: async ({ request, page }) => {
             await page._client.send('Emulation.clearDeviceMetricsOverride');
             await page.goto(request.url, { timeout: 60000 });
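The "+ 1" is there because maxRequestsPerCrawl counts every request, including the initial start URL, while maxCrawledPlaces is meant to count only place detail pages. A small stand-alone sketch of that mapping (the helper name is made up, not part of the repo):

    // Hypothetical helper mirroring the option-building logic in the hunk above.
    const buildCrawlerLimit = (maxCrawledPlaces) => {
        const opts = {};
        if (maxCrawledPlaces) {
            opts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // first request is the start URL
        }
        return opts;
    };

    console.log(buildCrawlerLimit(50)); // { maxRequestsPerCrawl: 51 }
    console.log(buildCrawlerLimit(0));  // {}  -> 0 (or undefined) means "no limit"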
@@ -36,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
             // Store state of listing pagination
             // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
             const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
-            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
+            await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl);
             listingPagination.isFinish = true;
             await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
         } else {
@@ -113,7 +119,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
                 errors: request.errorMessages,
             });
         },
-    });
+    }));
 };

 module.exports = { setUpCrawler };
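Putting it together, a caller might wire the limited crawler roughly as follows. This is a sketch assuming the Apify SDK 0.x APIs already used in this repo (Apify.main, Apify.openRequestQueue); the launch options and the limit of 20 are made-up values:

    // Hypothetical caller of the exported setUpCrawler, mirroring main.js above.
    const Apify = require('apify');
    const placesCrawler = require('./places_crawler');

    Apify.main(async () => {
        const requestQueue = await Apify.openRequestQueue();
        // In the actual actor, main.js also enqueues the Google Maps start URL before running (not shown here).
        // Limit the run to 20 place detail pages (plus the start URL counted internally).
        const crawler = placesCrawler.setUpCrawler({ headless: true }, requestQueue, 20);
        await crawler.run();
    });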