Updated comments and readme

This commit is contained in:
drobnikj 2019-01-10 14:37:45 +01:00
parent 797cc8ff17
commit ddf08817be
5 changed files with 122 additions and 16 deletions

112
README.md
View File

@ -3,9 +3,10 @@ Get data from Google Places, which official [Google Maps Places API](https://dev
## Why? ## Why?
You can use the official [Google Maps Places API](https://developers.google.com/places/web-service/search); it is the better choice for most use cases. You can use the official [Google Maps Places API](https://developers.google.com/places/web-service/search); it is the better choice for most use cases.
But API doesn't provide:
- Popular place times histogram Unlike the Google Maps Places API, the crawler lets you get:
- Popular place times histogram (There is no data for that in official API)
- Place reviews (you can get up to 5 reviews from official API) - Place reviews (you can get up to 5 reviews from official API)
- Place photos (you can get up to 10 photos from official API) - Place photos (you can get up to 10 photos from official API)
@ -32,3 +33,110 @@ On this input actor searches places on this start url: https://www.google.com/ma
## OUTPUT ## OUTPUT
Once the actor finishes, it outputs results to actor default dataset. Once the actor finishes, it outputs results to actor default dataset.
Example results item:
```text
{
"title": "Scotiabank",
"totalScore": 3.7,
"categoryName": "Bank",
"address": "201 Bishopsgate, London EC2M 3NS, UK",
"plusCode": "GWCC+75 City of London, London, UK",
"popularTimesHistogram": {
"Su": [],
"Mo": [
{
"hour": 6,
"occupancyPercent": 0
},
{
"hour": 7,
"occupancyPercent": 0
},
{
"hour": 8,
"occupancyPercent": 0
},
{
"hour": 9,
"occupancyPercent": 75
},
{
"hour": 10,
"occupancyPercent": 73
},
{
"hour": 11,
"occupancyPercent": 60
},
{
"hour": 12,
"occupancyPercent": 57
},
{
"hour": 13,
"occupancyPercent": 56
},
{
"hour": 14,
"occupancyPercent": 56
},
{
"hour": 15,
"occupancyPercent": 57
},
{
"hour": 16,
"occupancyPercent": 50
},
{
"hour": 17,
"occupancyPercent": 33
},
{
"hour": 18,
"occupancyPercent": 14
},
{
"hour": 19,
"occupancyPercent": 4
},
{
"hour": 20,
"occupancyPercent": 1
},
{
"hour": 21,
"occupancyPercent": 0
},
{
"hour": 22,
"occupancyPercent": 0
},
{
"hour": 23,
"occupancyPercent": 0
}
],
...
},
"reviews": [
{
"name": "NELLORE BALA NAVEEN REDDY",
"text": "nice bank in london",
"stars": "5 stars",
"publishAt": "2 months ago",
"likesCount": "",
"responseFromOwnerText": ""
},
...
],
"reviewsCount": 6,
"imageUrls": [
"https://lh5.googleusercontent.com/p/AF1QipPvm-rzo7_mlLRmctQwDJV6agVGHZMUJYLinU_t=s508-k-no",
...
],
"url": "https://www.google.com/maps/place/Scotiabank/@51.5258542,-0.335595,11z/data=!4m8!1m2!2m1!1sbanks+london!3m4!1s0x48761cb181573665:0x5fce6a25f2e99723!8m2!3d51.5206306!4d-0.0795672"
}
```

View File

@ -3,4 +3,3 @@ exports.DEFAULT_TIMEOUT = 60 * 1000; // 60 sec
exports.LISTING_PAGINATION_KEY = 'listingState'; exports.LISTING_PAGINATION_KEY = 'listingState';
exports.MAX_PAGE_RETRIES = 5; exports.MAX_PAGE_RETRIES = 5;

View File

@ -36,7 +36,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
}; };
/** /**
* Adds all places from listing to queue * Method adds places from listing to queue
* @param page * @param page
* @param searchString * @param searchString
* @param requestQueue * @param requestQueue
@ -55,10 +55,10 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
try { try {
await page.waitForSelector('h1.section-hero-header-title'); await page.waitForSelector('h1.section-hero-header-title');
} catch (e) { } catch (e) {
// It can happen, if there are listing, not just detail page // It can happen if there is a list of details.
} }
// In case there is no listing, put just detail page to queue // In case there is no list of details, it enqueues just the detail page
const maybeDetailPlace = await page.$('h1.section-hero-header-title'); const maybeDetailPlace = await page.$('h1.section-hero-header-title');
if (maybeDetailPlace) { if (maybeDetailPlace) {
const url = page.url(); const url = page.url();
@ -66,7 +66,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
return; return;
} }
// In case there is listing, go through all details, limits with maxPlacesPerCrawl // In case there is a list of details, it goes through the details, limited by maxPlacesPerCrawl
const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]'; const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
let isFinished; let isFinished;
while (true) { while (true) {

View File

@ -11,7 +11,7 @@ const { enqueueAllPlaceDetails } = require('./enqueue_places_crawler');
* @param page * @param page
*/ */
const extractPlaceDetail = async (page) => { const extractPlaceDetail = async (page) => {
// Extracts basic information // Extract basic information
const titleSel = 'h1.section-hero-header-title'; const titleSel = 'h1.section-hero-header-title';
await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT }); await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
const detail = await page.evaluate(() => { const detail = await page.evaluate(() => {
@ -24,7 +24,7 @@ const extractPlaceDetail = async (page) => {
}; };
}); });
// Extracty histogram for popular times // Extract histogram for popular times
const histogramSel = '.section-popular-times'; const histogramSel = '.section-popular-times';
if (await page.$(histogramSel)) { if (await page.$(histogramSel)) {
detail.popularTimesHistogram = await page.evaluate(() => { detail.popularTimesHistogram = await page.evaluate(() => {
@ -59,7 +59,7 @@ const extractPlaceDetail = async (page) => {
}); });
} }
// Extracts reviews // Extract reviews
detail.reviews = []; detail.reviews = [];
const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]'; const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]';
if (detail.totalScore) { if (detail.totalScore) {
@ -117,7 +117,7 @@ const extractPlaceDetail = async (page) => {
await page.click('button.section-header-back-button'); await page.click('button.section-header-back-button');
} }
// Extracts place images // Extract place images
await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT }); await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
const imagesButtonSel = '.section-image-pack-image-container'; const imagesButtonSel = '.section-image-pack-image-container';
const imagesButton = await page.$(imagesButtonSel); const imagesButton = await page.$(imagesButtonSel);
@ -153,8 +153,8 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
requestQueue, requestQueue,
maxRequestRetries: MAX_PAGE_RETRIES, maxRequestRetries: MAX_PAGE_RETRIES,
retireInstanceAfterRequestCount: 10, retireInstanceAfterRequestCount: 10,
handlePageTimeoutSecs: 15 * 60, // 15 min because startUrl enqueueing handlePageTimeoutSecs: 15 * 60, // long timeout, because of startUrl enqueueing
maxOpenPagesPerInstance: 1, // Because startUrl enqueueing crashes if we mixed tabs with details scraping maxOpenPagesPerInstance: 1, // Because of startUrl enqueueing crashes if we mix tabs with another scraping
}; };
if (maxCrawledPlaces) { if (maxCrawledPlaces) {
crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl

View File

@ -2,13 +2,13 @@ const Apify = require('apify');
const rp = require('request-promise'); const rp = require('request-promise');
/** /**
* Check if user some of user proxies work for Google Maps * Check if some of the proxies work for Google Maps
* @param proxyConfig * @param proxyConfig
*/ */
const proxyCheck = async (proxyConfig) => { const proxyCheck = async (proxyConfig) => {
const proxy = Apify.getApifyProxyUrl({ groups: proxyConfig.apifyProxyGroups }); const proxy = Apify.getApifyProxyUrl({ groups: proxyConfig.apifyProxyGroups });
// Check if user used Apify Proxy // Check if user uses Apify Proxy
if (!proxyConfig.useApifyProxy) { if (!proxyConfig.useApifyProxy) {
return { return {
isPass: false, isPass: false,
@ -44,4 +44,3 @@ const proxyCheck = async (proxyConfig) => {
module.exports = { module.exports = {
proxyCheck, proxyCheck,
}; };