mirror of https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00

Added max crawled places options

This commit is contained in:
parent 0689be833c
commit e6612c9743
@@ -18,6 +18,12 @@
         "prefill": { "useApifyProxy": true },
         "editor": "proxy"
     },
+    "maxCrawledPlaces": {
+        "title": "Max crawled places",
+        "type": "integer",
+        "description": "Use to limit places you want to get from crawler. If you fill 0 or nothing all places will be scrape",
+        "minimum": 0
+    },
     "lat": {
         "title": "Viewport Latitude",
         "type": "string",
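The schema hunk above adds the maxCrawledPlaces input field. Below is a minimal sketch of an actor INPUT that exercises it; the concrete values are illustrative only, while the field names come from the schema and the main.js hunks further down.

// Illustrative INPUT object; searchString and proxy values are made up,
// the field names match the input schema shown in the hunk above.
const exampleInput = {
    searchString: 'coffee shops',         // required, main.js throws if it is missing
    proxyConfig: { useApifyProxy: true }, // optional proxy settings
    maxCrawledPlaces: 50,                 // new option: cap on place details; 0 or omitted = no limit
};

console.log(JSON.stringify(exampleInput, null, 2));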
@@ -31,14 +31,14 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
 
 /**
  * Crawler add all place detail from listing to queue
- * @param startUrl
+ * @param page
  * @param searchString
  * @param launchPuppeteerOptions
  * @param requestQueue
  * @param listingPagination
- * @param retries
+ * @param maxRequestsPerCrawl
  */
-const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination) => {
+const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, maxRequestsPerCrawl) => {
     await page.type('#searchboxinput', searchString);
     await sleep(5000);
     await page.click('#searchbox-searchbutton');
@@ -77,7 +77,7 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
                 .attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
-        if (isNextPaginationDisabled || noResultsEl) {
+        if (isNextPaginationDisabled || noResultsEl || (maxRequestsPerCrawl && maxRequestsPerCrawl < to)) {
             break;
         } else {
             // NOTE: puppeteer API click() didn't work :(
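For readability, here is a hedged sketch of the new stop condition in isolation. It assumes, which the hunk itself does not show, that `to` is the running count of place-detail results already enqueued from the listing pages; the other flags come straight from the diff above.

// Sketch only: `to` is assumed to be the number of place-detail requests already enqueued.
const shouldStopPaging = ({ isNextPaginationDisabled, noResultsEl, maxRequestsPerCrawl, to }) => {
    // Stop when the "next page" button is disabled, the search returned nothing,
    // or the number of enqueued places has already passed the requested cap.
    return Boolean(isNextPaginationDisabled)
        || Boolean(noResultsEl)
        || Boolean(maxRequestsPerCrawl && maxRequestsPerCrawl < to);
};

// e.g. shouldStopPaging({ isNextPaginationDisabled: false, noResultsEl: null, maxRequestsPerCrawl: 51, to: 60 }) === true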
@@ -3,7 +3,7 @@ const placesCrawler = require('./places_crawler');
 
 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');
-    const { searchString, proxyConfig, lat, lng } = input;
+    const { searchString, proxyConfig, lat, lng, maxCrawledPlaces } = input;
 
     if (!searchString) throw new Error('Attribute searchString missing in input.');
 
@@ -26,7 +26,7 @@ Apify.main(async () => {
     if (proxyConfig) Object.assign(launchPuppeteerOptions, proxyConfig);
 
     // Scrape all place detail links
-    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue);
+    const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces);
     await crawler.run();
 
     console.log('Done!');
@@ -11,10 +11,11 @@ const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');
  * Method to set up crawler to get all place details and save them to default dataset
  * @param launchPuppeteerOptions
  * @param requestQueue
+ * @param maxCrawledPlaces
  * @return {Apify.PuppeteerCrawler}
  */
-const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
-    return new Apify.PuppeteerCrawler({
+const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => {
+    const crawlerOpts = {
         launchPuppeteerOptions,
         requestQueue,
         maxRequestRetries: MAX_PAGE_RETRIES,
@@ -22,6 +23,11 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
         handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler
         maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping
         // maxConcurrency: 1,
+    };
+    if (maxCrawledPlaces) {
+        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
+    }
+    return new Apify.PuppeteerCrawler(Object.assign(crawlerOpts, {
         gotoFunction: async ({ request, page }) => {
             await page._client.send('Emulation.clearDeviceMetricsOverride');
             await page.goto(request.url, { timeout: 60000 });
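The hunk above turns the fixed options object into crawlerOpts and derives Apify's maxRequestsPerCrawl from the new maxCrawledPlaces input. A minimal sketch of just that mapping follows; the helper name is ours, not part of the commit.

// Hypothetical helper mirroring the cap calculation above; not part of the commit.
const toMaxRequestsPerCrawl = (maxCrawledPlaces) => {
    // 0, undefined or null means "no limit", so no maxRequestsPerCrawl is set at all.
    if (!maxCrawledPlaces) return undefined;
    // The first request in the queue is the startUrl (the search page itself),
    // so the place-detail cap is shifted by one.
    return maxCrawledPlaces + 1;
};

console.log(toMaxRequestsPerCrawl(50)); // 51
console.log(toMaxRequestsPerCrawl(0));  // undefined => unlimited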
@@ -36,7 +42,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
                 // Store state of listing pagination
                 // NOTE: Ensured - If pageFunction failed crawler skipped already scraped pagination
                 const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};
-                await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination);
+                await enqueueAllPlaceDetailsCrawler.run(page, searchString, launchPuppeteerOptions, requestQueue, listingPagination, crawlerOpts.maxRequestsPerCrawl);
                 listingPagination.isFinish = true;
                 await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
             } else {
@@ -113,7 +119,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue) => {
                 errors: request.errorMessages,
             });
         },
-    });
+    }));
 };
 
 module.exports = { setUpCrawler };