From ddf08817be7f070bdcea80a61f72c06f8d0491a4 Mon Sep 17 00:00:00 2001 From: drobnikj Date: Thu, 10 Jan 2019 14:37:45 +0100 Subject: [PATCH] Updated comments and readme --- README.md | 112 +++++++++++++++++++++++++++++++++- src/consts.js | 1 - src/enqueue_places_crawler.js | 8 +-- src/places_crawler.js | 12 ++-- src/proxy_check.js | 5 +- 5 files changed, 122 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 130fd0e..d6fbc97 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,10 @@ Get data from Google Places, which official [Google Maps Places API](https://dev ## Why? You can use official [Google Maps Places API](https://developers.google.com/places/web-service/search), it is better way for the most use cases. -But API doesn't provide: -- Popular place times histogram +Unlike the official Google Maps Places API, you can get the following from the crawler: + +- Popular place times histogram (there is no data for that in the official API) - Place reviews (you can get up to 5 reviews from official API) - Place photos (you can can up to 10 photos from official API) @@ -32,3 +33,110 @@ On this input actor searches places on this start url: https://www.google.com/ma ## OUTPUT Once the actor finishes, it outputs results to actor default dataset.
+ +Example results item: + +```text +{ + "title": "Scotiabank", + "totalScore": 3.7, + "categoryName": "Bank", + "address": "201 Bishopsgate, London EC2M 3NS, UK", + "plusCode": "GWCC+75 City of London, London, UK", + "popularTimesHistogram": { + "Su": [], + "Mo": [ + { + "hour": 6, + "occupancyPercent": 0 + }, + { + "hour": 7, + "occupancyPercent": 0 + }, + { + "hour": 8, + "occupancyPercent": 0 + }, + { + "hour": 9, + "occupancyPercent": 75 + }, + { + "hour": 10, + "occupancyPercent": 73 + }, + { + "hour": 11, + "occupancyPercent": 60 + }, + { + "hour": 12, + "occupancyPercent": 57 + }, + { + "hour": 13, + "occupancyPercent": 56 + }, + { + "hour": 14, + "occupancyPercent": 56 + }, + { + "hour": 15, + "occupancyPercent": 57 + }, + { + "hour": 16, + "occupancyPercent": 50 + }, + { + "hour": 17, + "occupancyPercent": 33 + }, + { + "hour": 18, + "occupancyPercent": 14 + }, + { + "hour": 19, + "occupancyPercent": 4 + }, + { + "hour": 20, + "occupancyPercent": 1 + }, + { + "hour": 21, + "occupancyPercent": 0 + }, + { + "hour": 22, + "occupancyPercent": 0 + }, + { + "hour": 23, + "occupancyPercent": 0 + } + ], + ... + }, + "reviews": [ + { + "name": "NELLORE BALA NAVEEN REDDY", + "text": "nice bank in london", + "stars": "5 stars", + "publishAt": "2 months ago", + "likesCount": "", + "responseFromOwnerText": "" + }, + ... + ], + "reviewsCount": 6, + "imageUrls": [ + "https://lh5.googleusercontent.com/p/AF1QipPvm-rzo7_mlLRmctQwDJV6agVGHZMUJYLinU_t=s508-k-no", + ... 
+ ], + "url": "https://www.google.com/maps/place/Scotiabank/@51.5258542,-0.335595,11z/data=!4m8!1m2!2m1!1sbanks+london!3m4!1s0x48761cb181573665:0x5fce6a25f2e99723!8m2!3d51.5206306!4d-0.0795672" +} +``` diff --git a/src/consts.js b/src/consts.js index 08bbdf6..3f7ec94 100644 --- a/src/consts.js +++ b/src/consts.js @@ -3,4 +3,3 @@ exports.DEFAULT_TIMEOUT = 60 * 1000; // 60 sec exports.LISTING_PAGINATION_KEY = 'listingState'; exports.MAX_PAGE_RETRIES = 5; - diff --git a/src/enqueue_places_crawler.js b/src/enqueue_places_crawler.js index 3f846d0..cea7929 100644 --- a/src/enqueue_places_crawler.js +++ b/src/enqueue_places_crawler.js @@ -36,7 +36,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, }; /** - * Adds all places from listing to queue + * Adds places from the listing to the queue * @param page * @param searchString * @param requestQueue @@ -55,10 +55,10 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace try { await page.waitForSelector('h1.section-hero-header-title'); } catch (e) { - // It can happen, if there are listing, not just detail page + // It can happen if there is a list of details.
} - // In case there is no listing, put just detail page to queue + // In case there is no list of details, it enqueues just the detail page const maybeDetailPlace = await page.$('h1.section-hero-header-title'); if (maybeDetailPlace) { const url = page.url(); @@ -66,7 +66,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace return; } - // In case there is listing, go through all details, limits with maxPlacesPerCrawl + // In case there is a list of details, it goes through the details, limited by maxPlacesPerCrawl const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]'; let isFinished; while (true) { diff --git a/src/places_crawler.js b/src/places_crawler.js index a305dde..b76c537 100644 --- a/src/places_crawler.js +++ b/src/places_crawler.js @@ -11,7 +11,7 @@ const { enqueueAllPlaceDetails } = require('./enqueue_places_crawler'); * @param page */ const extractPlaceDetail = async (page) => { - // Extracts basic information + // Extract basic information const titleSel = 'h1.section-hero-header-title'; await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT }); const detail = await page.evaluate(() => { }; }); - // Extracty histogram for popular times + // Extract histogram for popular times const histogramSel = '.section-popular-times'; if (await page.$(histogramSel)) { detail.popularTimesHistogram = await page.evaluate(() => { }); } - // Extracts reviews + // Extract reviews detail.reviews = []; const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]'; if (detail.totalScore) { await page.click('button.section-header-back-button'); } - // Extracts place images + // Extract place images await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT }); const imagesButtonSel =
'.section-image-pack-image-container'; const imagesButton = await page.$(imagesButtonSel); @@ -153,8 +153,8 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => requestQueue, maxRequestRetries: MAX_PAGE_RETRIES, retireInstanceAfterRequestCount: 10, - handlePageTimeoutSecs: 15 * 60, // 15 min because startUrl enqueueing - maxOpenPagesPerInstance: 1, // Because startUrl enqueueing crashes if we mixed tabs with details scraping + handlePageTimeoutSecs: 15 * 60, // long timeout because of startUrl enqueueing + maxOpenPagesPerInstance: 1, // Because startUrl enqueueing crashes if we mix tabs with other scraping }; if (maxCrawledPlaces) { crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl diff --git a/src/proxy_check.js b/src/proxy_check.js index 09a4d65..a02f678 100644 --- a/src/proxy_check.js +++ b/src/proxy_check.js @@ -2,13 +2,13 @@ const Apify = require('apify'); const rp = require('request-promise'); /** - * Check if user some of user proxies work for Google Maps + * Check if some of the user's proxies work for Google Maps * @param proxyConfig */ const proxyCheck = async (proxyConfig) => { const proxy = Apify.getApifyProxyUrl({ groups: proxyConfig.apifyProxyGroups }); - // Check if user used Apify Proxy + // Check if user uses Apify Proxy if (!proxyConfig.useApifyProxy) { return { isPass: false, @@ -44,4 +44,3 @@ const proxyCheck = async (proxyConfig) => { module.exports = { proxyCheck, }; -