mirror of https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00

commit 12950f6b74 (parent d7a08eb19c)
Several updates

README.md | 68 lines changed

@@ -1,22 +1,80 @@
 # Crawler Google Places
-Crawler searches input on [google maps](https://www.google.com/maps) and return all information about found places.
+Get data from Google Places that the official [Google Maps Places API](https://developers.google.com/places/web-service/search) does not provide.

-## How to use through API
-How to use Actor from Apify UI/API see [actor detail page](https://www.apify.com/drobnikj/crawler-google-places).
+## Why?
+You can use the official [Google Maps Places API](https://developers.google.com/places/web-service/search); it is the better choice for most use cases.
+But the API doesn't provide everything:

+- Popular place times histogram
+- Place reviews (you can get up to 5 reviews from the official API)
+- Place photos (you can get up to 10 photos from the official API)

+## INPUT
+Follow the guide on the [actor detail page](https://www.apify.com/drobnikj/crawler-google-places) to see how it works.

 Example input:
 ```json
 {
-    "searchString": "ČSOB",
+    "searchString": "pubs near prague",
     "lat": "50.0860729",
     "lng": "14.4135326",
     "zoom": 10
 }
 ```
-On this input actor searches places on this start url: https://www.google.com/maps/search/%C4%8Dsob/@50.0860729,14.4135326,10z
+With this input, the actor searches places from this start URL: https://www.google.com/maps/search/pubs+near+prague/@50.0860729,14.4135326,10z
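The mapping from the input fields to that start URL can be sketched as follows; a minimal sketch based on the example above, not the actor's exact code:

```javascript
// Hypothetical sketch of how the input maps to the Google Maps start URL.
// Google Maps itself renders spaces as '+' (pubs+near+prague); the %20 that
// encodeURIComponent() produces is in practice accepted as well.
const input = { searchString: 'pubs near prague', lat: '50.0860729', lng: '14.4135326', zoom: 10 };
const startUrl = `https://www.google.com/maps/search/${encodeURIComponent(input.searchString)}`
    + `/@${input.lat},${input.lng},${input.zoom}z`;
```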

 - `searchString` - String that will be searched on Google Maps
 - `proxyConfig` - Apify proxy configuration
 - `lat` - Viewport latitude
 - `lng` - Viewport longitude
 - `zoom` - Viewport zoom, e.g. zoom: 10 -> https://www.google.com/maps/@50.0860729,14.4135326,10z vs zoom: 1 -> https://www.google.com/maps/@50.0860729,14.4135326,1z
+- `maxCrawledPlaces` - Limit on the number of places the crawler scrapes

+## OUTPUT
+Once the actor finishes, it outputs results to the actor's default dataset.

+Example output for a place:
+```json
+{
+    "title": "Československá obchodní banka, a.s., pobočka Praha 5",
+    "totalScore": "1,7",
+    "categoryName": "Banka",
+    "address": "Arbesovo nám. 257/7, 150 00 Praha-Smíchov, Česko",
+    "plusCode": "3CG3+FQ Praha, Česko",
+    "url": "https://www.google.com/maps/place/%C4%8Ceskoslovensk%C3%A1+obchodn%C3%AD+banka,+a.s.,+pobo%C4%8Dka+Praha+5/@50.057267,14.4057479,12z/data=!4m8!1m2!2m1!1s%C4%8CSOB+near+prague!3m4!1s0x470bbe214008b6b3:0xcfe23567651c421!8m2!3d50.0762085!4d14.4044139",
+    "reviews": [
+        {
+            "name": "Anton Vasilyev",
+            "text": "V této pobočce pracuji vážená paní Alena Ladrova, která je prý odborníkem a na kterou jsem se musel obrátit třikrát. Vážená paní Ladrova si dovoluje nepřípustnou komunikaci se zákazníky, vyjadřuje svůj osobní názor na situaci, která musí být vyřešena. Několikrát řekla, že mi nesmí sloužit, protože na to nemá čas, a poradila mi, abych se obrátil na jiné oddělení. S tou zaměstnankyní jsem velmi nespokojený a jsem si jistý, že se s ní nikdy příště nechci nic řešit.",
+            "stars": "1 hvězdička",
+            "publishAt": "před 6 měsíci",
+            "likesCount": "",
+            "responseFromOwnerText": ""
+        },
+        {
+            "name": "Jan Beneš III.",
+            "text": "Fronta, otevřená jen jedna přepážka ze čtyř. Takhle se má chovat správce mých peněz?",
+            "stars": "1 hvězdička",
+            "publishAt": "před rokem",
+            "likesCount": "",
+            "responseFromOwnerText": ""
+        },
+        {
+            "name": "Roman Pauler",
+            "text": "",
+            "stars": "3 hvězdičky",
+            "publishAt": "před 2 lety",
+            "likesCount": "",
+            "responseFromOwnerText": ""
+        }
+    ],
+    "reviewsCount": "3",
+    "imageUrls": [
+        "https://lh5.googleusercontent.com/p/AF1QipPyQSZsrNgb21kyUhOtIZyh2oZfm6W9L39bjgnF=s790-k-no",
+        "https://lh5.googleusercontent.com/p/AF1QipM4GBWM4A8vl11qB4TSkRMACIPZ_hDDdY2NpXzo=s1056-k-no-pi-2.9338646-ya349.5-ro0-fo100",
+        "https://lh5.googleusercontent.com/p/AF1QipO3_cyeFWmGgxGF8No4FA1GtjXRMhxhwWeyzDo8=s555-k-no",
+        "https://lh5.googleusercontent.com/p/AF1QipMZ6UMzgg9NOfZNqgkbpfpBQlL6sqJi7p4WCAk=s312-k-no",
+        "https://geo1.ggpht.com/cbk?output=thumbnail&panoid=kq0AYAuZW_FmNyOro5nAfQ&minw=1071&minh=528&thumb=2&yaw=258.04102&pitch=0"
+    ]
+}
+```
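Beyond the UI, a published actor can be started from another Node.js script with the same Apify SDK this repo uses. A minimal sketch, assuming the actor is published under the ID drobnikj/crawler-google-places:

```javascript
const Apify = require('apify');

Apify.main(async () => {
    // Apify.call() starts the actor, waits for it to finish and resolves with its run object.
    const run = await Apify.call('drobnikj/crawler-google-places', {
        searchString: 'pubs near prague',
        lat: '50.0860729',
        lng: '14.4135326',
        zoom: 10,
    });
    // Scraped places land in the run's default dataset (run.defaultDatasetId).
    console.log(`Run finished with status: ${run.status}`);
});
```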

enqueue_places_crawler.js
@@ -1,11 +1,10 @@
 const Apify = require('apify');

-const { sleep } = Apify.utils;
+const { sleep, log } = Apify.utils;
 const { DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts');

 const waitForGoogleMapLoader = (page) => page.waitFor(() => !document.querySelector('#searchbox')
-    .classList
-    .contains('loading'), { timeout: DEFAULT_TIMEOUT });
+    .classList.contains('loading'), { timeout: DEFAULT_TIMEOUT });

 const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, maxPlacesPerCrawl) => {
     let results = await page.$$('.section-result');

@@ -24,9 +23,9 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
         await page.waitForSelector('.section-back-to-list-button', { timeout: DEFAULT_TIMEOUT });
         const url = page.url();
         await requestQueue.addRequest({ url, userData: { label: 'detail' } });
-        console.log(`Added to queue ${url}`);
+        log.info(`Added to queue ${url}`);
         if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
-            console.log(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
+            log.info(`Reached max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
             break;
         }
         await page.click('.section-back-to-list-button');

@@ -48,12 +47,12 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
     await page.click('#searchbox-searchbutton');
     await sleep(5000);
     await waitForGoogleMapLoader(page);
-    // In case there is no listing, put just detail page to queue
     try {
         await page.waitForSelector('h1.section-hero-header-title');
     } catch (e) {
-        // It can happen, doesn't matter
+        // It can happen, doesn't matter :)
     }
+    // In case there is no listing, put just the detail page into the queue
     const maybeDetailPlace = await page.$('h1.section-hero-header-title');
     if (maybeDetailPlace) {
         const url = page.url();

@@ -68,25 +67,23 @@ const enqueueAllPlaceDetailsCrawler = async (page, searchString, launchPuppeteer
         const from = parseInt(fromString);
         const to = parseInt(toString);
         if (listingPagination.from && from <= listingPagination.from) {
-            console.log(`Skiped pagination ${from} - ${to}, already done!`);
+            log.debug(`Skipped pagination ${from} - ${to}, already done!`);
         } else {
-            console.log(`Added links from pagination ${from} - ${to}`);
+            log.debug(`Added links from pagination ${from} - ${to}`);
             await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
             listingPagination = { from, to };
             await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
         }
         await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
         const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
-            return !!$(nextButtonSelector)
-                .attr('disabled');
+            return !!$(nextButtonSelector).attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
         if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
             break;
         } else {
-            // NOTE: puppeteer API click() didn't work :(
-            await page.evaluate((sel) => $(sel)
-                .click(), nextButtonSelector);
+            // NOTE: puppeteer API click() didn't work :|
+            await page.evaluate((sel) => $(sel).click(), nextButtonSelector);
             await waitForGoogleMapLoader(page);
         }
     }
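A note on the pagination state handled above: LISTING_PAGINATION_KEY persists the last processed listing range via Apify.setValue(), so a restarted run can skip pages it has already enqueued (the "Skipped pagination" branch). The stored value is a plain object; an illustrative sketch with invented values:

```json
{ "from": 21, "to": 40 }
```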

main.js
@@ -1,5 +1,6 @@
 const Apify = require('apify');
 const placesCrawler = require('./places_crawler');
+const { log } = Apify.utils;

 Apify.main(async () => {
     const input = await Apify.getValue('INPUT');

@@ -7,7 +8,7 @@ Apify.main(async () => {

     if (!searchString) throw new Error('Attribute searchString missing in input.');

-    console.log('Scraping Google Places for search string:', searchString);
+    log.info('Scraping Google Places for search string:', searchString);

     let startUrl;
     if (lat || lng) {

@@ -18,7 +19,7 @@ Apify.main(async () => {
         startUrl = 'https://www.google.com/maps/search/';
     }

-    console.log('Start url is', startUrl);
+    log.info('Start url is', startUrl);
     const requestQueue = await Apify.openRequestQueue();
     await requestQueue.addRequest({ url: startUrl, userData: { label: 'startUrl', searchString } });

@@ -29,5 +30,5 @@ Apify.main(async () => {
     const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces);
     await crawler.run();

-    console.log('Done!');
+    log.info('Done!');
 });

places_crawler.js
@@ -1,12 +1,11 @@
 const Apify = require('apify');

-const { sleep } = Apify.utils;
+const { sleep, log } = Apify.utils;
 const infiniteScroll = require('./infinite_scroll');

 const { injectJQuery } = Apify.utils.puppeteer;
 const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts');
 const enqueueAllPlaceDetailsCrawler = require('./enqueue_places_crawler');

 /**
  * Method to set up crawler to get all place details and save them to default dataset
  * @param launchPuppeteerOptions

@@ -21,24 +20,25 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
         maxRequestRetries: MAX_PAGE_RETRIES,
         retireInstanceAfterRequestCount: 10,
         handlePageTimeoutSecs: 2 * 3600, // Two hours because startUrl crawler
-        maxOpenPagesPerInstance: 1, // Because startUrl crawler crashes if we mixed it with details scraping
+        maxOpenPagesPerInstance: 1, // Because the startUrl crawler crashes if we mix tabs with details scraping
         // maxConcurrency: 1,
     };
     if (maxCrawledPlaces) {
         crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
     }
-    return new Apify.PuppeteerCrawler(Object.assign(crawlerOpts, {
+    return new Apify.PuppeteerCrawler({
+        ...crawlerOpts,
         gotoFunction: async ({ request, page }) => {
             await page._client.send('Emulation.clearDeviceMetricsOverride');
             await page.goto(request.url, { timeout: 60000 });
         },
         handlePageFunction: async ({ request, page }) => {
             const { label, searchString } = request.userData;
-            console.log(`Open ${request.url} with label: ${label}`);
+            log.info(`Open ${request.url} with label: ${label}`);
             await injectJQuery(page);
             if (label === 'startUrl') {
                 // enqueue all places
-                console.log(`Start enqueuing place details for search: ${searchString}`);
+                log.info(`Start enqueuing place details for search: ${searchString}`);
                 // Store state of listing pagination
                 // NOTE: ensured that if pageFunction fails, the crawler skips already scraped pagination
                 const listingPagination = await Apify.getValue(LISTING_PAGINATION_KEY) || {};

@@ -51,7 +51,8 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
                     throw new Error('HandlePagefunction timed out!');
                 }, 600000);
                 // Get data from review
-                await page.waitForSelector('h1.section-hero-header-title', { timeout: DEFAULT_TIMEOUT });
+                const titleSel = 'h1.section-hero-header-title';
+                await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
                 const placeDetail = await page.evaluate(() => {
                     return {
                         title: $('h1.section-hero-header-title').text().trim(),

@@ -62,33 +63,64 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
                     };
                 });
                 placeDetail.url = request.url;
-                placeDetail.reviews = [];
-                if (placeDetail.totalScore) {
-                    placeDetail.reviewsCount = await page.evaluate(() => {
-                        const numberReviewsText = $('button.section-reviewchart-numreviews').text().trim();
-                        return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null;
+                const histogramSel = '.section-popular-times';
+                if (await page.$(histogramSel)) {
+                    placeDetail.popularTimesHistogram = await page.evaluate(() => {
+                        const graphs = {};
+                        const days = ['Su', 'Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa'];
+                        // Days graphs
+                        $('.section-popular-times-graph').each(function (i) {
+                            const day = days[i];
+                            graphs[day] = [];
+                            let graphStartFromHour;
+                            $(this).find('.section-popular-times-label').each(function (labelIndex) {
+                                if (graphStartFromHour) return;
+                                const hourText = $(this).text().trim();
+                                graphStartFromHour = hourText.includes('p')
+                                    ? 12 + (parseInt(hourText) - labelIndex)
+                                    : parseInt(hourText) - labelIndex;
+                            });
+                            $(this).find('.section-popular-times-bar').each(function (barIndex) {
+                                const occupancy = $(this).attr('aria-label').match(/\d+\s{1,}%/)[0];
+                                const maybeHour = graphStartFromHour + barIndex;
+                                graphs[day].push({
+                                    hour: maybeHour > 24 ? maybeHour - 24 : maybeHour,
+                                    occupancy,
+                                });
+                            });
+                        });
+                        return graphs;
                     });
+                }
+                placeDetail.reviews = [];
+                const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]';
+                if (placeDetail.totalScore) {
+                    placeDetail.reviewsCount = await page.evaluate((selector) => {
+                        const numberReviewsText = $(selector).text().trim();
+                        return (numberReviewsText) ? numberReviewsText.match(/\d+/)[0] : null;
+                    }, reviewsButtonSel);
                     // If we find consent dialog, close it!
                     if (await page.$('.widget-consent-dialog')) {
                         await page.click('.widget-consent-dialog .widget-consent-button-later');
                     }
                     // Get all reviews
-                    await page.waitForSelector('button.section-reviewchart-numreviews')
-                    await page.click('button.section-reviewchart-numreviews');
+                    await page.waitForSelector(reviewsButtonSel);
+                    await page.click(reviewsButtonSel);
                     await page.waitForSelector('.section-star-display', { timeout: DEFAULT_TIMEOUT });
                     await sleep(5000);
                     // Sort reviews by newest, one click sometimes didn't work :)
                     try {
-                        await page.click('.section-tab-info-stats-button-flex');
+                        const sortButtonEl = '.section-tab-info-stats-button-flex';
+                        await page.click(sortButtonEl);
                         await sleep(1000);
-                        await page.click('.section-tab-info-stats-button-flex');
+                        await page.click(sortButtonEl);
                         await sleep(1000);
-                        await page.click('.section-tab-info-stats-button-flex');
+                        await page.click(sortButtonEl);
                         await sleep(5000);
                         await page.click('.context-menu-entry[data-index="1"]');
                     } catch (err) {
-                        // It can happen, it is not big issue
-                        console.log('Cannot select reviews by newest!');
+                        // It can happen, it is not a big issue :)
+                        log.debug('Cannot select reviews by newest!');
                     }
                     await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox');
                     const reviewEls = await page.$$('div.section-review');

@@ -115,11 +147,29 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
                         }, reviewEl);
                         placeDetail.reviews.push(review);
                     }
+                    await page.click('button.section-header-back-button');
+                }
+                await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
+                const imagesButtonSel = '[jsaction="pane.imagepack.button"]';
+                if (await page.$(imagesButtonSel)) {
+                    await page.click(imagesButtonSel);
+                    await infiniteScroll(page, 99999999999, '.section-scrollbox.section-listbox');
+                    placeDetail.imageUrls = await page.evaluate(() => {
+                        const urls = [];
+                        $('.gallery-image-high-res').each(function () {
+                            const urlMatch = $(this).attr('style').match(/url\("(.*)"\)/);
+                            if (!urlMatch) return;
+                            let imageUrl = urlMatch[1];
+                            if (imageUrl[0] === '/') imageUrl = `https:${imageUrl}`;
+                            urls.push(imageUrl);
+                        });
+                        return urls;
+                    });
                 }
                 await Apify.pushData(placeDetail);
             }
-            console.log(request.url, 'Done');
+            log.info('Finished', request.url);
         },
         handleFailedRequestFunction: async ({ request }) => {
             // This function is called when the crawling of a request fails too many times

@@ -129,7 +179,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
                 errors: request.errorMessages,
             });
         },
-    }));
+    });
 };

 module.exports = { setUpCrawler };
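The popularTimesHistogram field built above maps two-letter day names to arrays of hourly entries, with occupancy kept as the percentage string scraped from each bar's aria-label. An illustrative sketch of the shape, with invented values:

```json
{
    "Su": [
        { "hour": 6, "occupancy": "10 %" },
        { "hour": 7, "occupancy": "25 %" }
    ],
    "Mo": [
        { "hour": 6, "occupancy": "5 %" }
    ]
}
```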