mirror of https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00
Updated comments and readme
This commit is contained in:
parent 797cc8ff17
commit ddf08817be

README.md | 112

@@ -3,9 +3,10 @@ Get data from Google Places, which official [Google Maps Places API](https://dev

 ## Why?

 You can use the official [Google Maps Places API](https://developers.google.com/places/web-service/search); it is the better option for most use cases.

-But API doesn't provide:
-- Popular place times histogram
+Unlike the Google Maps Places API, the crawler gives you:
+- Popular place times histogram (there is no data for this in the official API)
 - Place reviews (the official API returns at most 5 reviews)
 - Place photos (the official API returns at most 10 photos)

@@ -32,3 +33,110 @@ On this input actor searches places on this start url: https://www.google.com/ma

## OUTPUT

Once the actor finishes, it outputs the results to its default dataset.

Example results item:

```text
{
  "title": "Scotiabank",
  "totalScore": 3.7,
  "categoryName": "Bank",
  "address": "201 Bishopsgate, London EC2M 3NS, UK",
  "plusCode": "GWCC+75 City of London, London, UK",
  "popularTimesHistogram": {
    "Su": [],
    "Mo": [
      { "hour": 6, "occupancyPercent": 0 },
      { "hour": 7, "occupancyPercent": 0 },
      { "hour": 8, "occupancyPercent": 0 },
      { "hour": 9, "occupancyPercent": 75 },
      { "hour": 10, "occupancyPercent": 73 },
      { "hour": 11, "occupancyPercent": 60 },
      { "hour": 12, "occupancyPercent": 57 },
      { "hour": 13, "occupancyPercent": 56 },
      { "hour": 14, "occupancyPercent": 56 },
      { "hour": 15, "occupancyPercent": 57 },
      { "hour": 16, "occupancyPercent": 50 },
      { "hour": 17, "occupancyPercent": 33 },
      { "hour": 18, "occupancyPercent": 14 },
      { "hour": 19, "occupancyPercent": 4 },
      { "hour": 20, "occupancyPercent": 1 },
      { "hour": 21, "occupancyPercent": 0 },
      { "hour": 22, "occupancyPercent": 0 },
      { "hour": 23, "occupancyPercent": 0 }
    ],
    ...
  },
  "reviews": [
    {
      "name": "NELLORE BALA NAVEEN REDDY",
      "text": "nice bank in london",
      "stars": "5 stars",
      "publishAt": "2 months ago",
      "likesCount": "",
      "responseFromOwnerText": ""
    },
    ...
  ],
  "reviewsCount": 6,
  "imageUrls": [
    "https://lh5.googleusercontent.com/p/AF1QipPvm-rzo7_mlLRmctQwDJV6agVGHZMUJYLinU_t=s508-k-no",
    ...
  ],
  "url": "https://www.google.com/maps/place/Scotiabank/@51.5258542,-0.335595,11z/data=!4m8!1m2!2m1!1sbanks+london!3m4!1s0x48761cb181573665:0x5fce6a25f2e99723!8m2!3d51.5206306!4d-0.0795672"
}
```
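
For reference, items like the one above can be read back from the default dataset with the Apify SDK the actor already uses. A minimal sketch, assuming SDK 0.x (`Apify.openDataset` / `dataset.getData`); the `limit` value and the logged fields are illustrative only:

```js
// Minimal sketch: read the actor's output back from its default dataset (Apify SDK 0.x assumed).
const Apify = require('apify');

Apify.main(async () => {
    const dataset = await Apify.openDataset(); // default dataset of the current run
    const { items } = await dataset.getData({ limit: 10 }); // illustrative limit
    for (const place of items) {
        console.log(`${place.title} (${place.totalScore}) - ${place.address}`);
    }
});
```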

@@ -3,4 +3,3 @@ exports.DEFAULT_TIMEOUT = 60 * 1000; // 60 sec

 exports.LISTING_PAGINATION_KEY = 'listingState';
 exports.MAX_PAGE_RETRIES = 5;

@@ -36,7 +36,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
 };

 /**
- * Adds all places from listing to queue
+ * Adds places from the listing to the request queue
  * @param page
  * @param searchString
  * @param requestQueue
@@ -55,10 +55,10 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
     try {
         await page.waitForSelector('h1.section-hero-header-title');
     } catch (e) {
-        // It can happen, if there are listing, not just detail page
+        // This can happen when the page shows a list of details instead of a single detail page.
     }

-    // In case there is no listing, put just detail page to queue
+    // If there is no list of details, enqueue just the detail page.
     const maybeDetailPlace = await page.$('h1.section-hero-header-title');
     if (maybeDetailPlace) {
         const url = page.url();
@@ -66,7 +66,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
         return;
     }

-    // In case there is listing, go through all details, limits with maxPlacesPerCrawl
+    // If there is a list of details, go through them, limited by maxPlacesPerCrawl.
     const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
     let isFinished;
     while (true) {
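
The comment changes in the hunks above describe the branching inside `enqueueAllPlaceDetails`: a single detail page gets enqueued directly, otherwise the listing is paged through up to `maxPlacesPerCrawl`. A condensed sketch of that flow, under stated assumptions: only the two selectors quoted in the diff are real, while the place-link selector, the `userData` label, and the fixed wait between pages are placeholders, not the actor's actual code.

```js
// Sketch of the listing-vs-detail branching described above.
// Real selectors from the diff: 'h1.section-hero-header-title', '[jsaction="pane.paginationSection.nextPage"]'.
// Placeholders: the place-link selector, the userData label, and the fixed wait.
const enqueuePlacesSketch = async (page, requestQueue, maxPlacesPerCrawl) => {
    // A detail page renders the hero title; in that case enqueue just the current URL.
    const maybeDetailPlace = await page.$('h1.section-hero-header-title');
    if (maybeDetailPlace) {
        await requestQueue.addRequest({ url: page.url(), userData: { label: 'detail' } });
        return;
    }

    // Otherwise walk the listing pages until the "next" button disappears
    // or the maxPlacesPerCrawl limit is reached.
    const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
    let enqueued = 0;
    while (true) {
        const links = await page.$$eval('a[href*="/maps/place/"]', (anchors) => anchors.map((a) => a.href));
        for (const url of links) {
            if (maxPlacesPerCrawl && enqueued >= maxPlacesPerCrawl) return;
            await requestQueue.addRequest({ url, userData: { label: 'detail' } });
            enqueued += 1;
        }
        const nextButton = await page.$(nextButtonSelector);
        if (!nextButton) break;
        await nextButton.click();
        await new Promise((resolve) => setTimeout(resolve, 2000)); // crude wait; the actor's real pagination handling differs
    }
};
```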
@@ -11,7 +11,7 @@ const { enqueueAllPlaceDetails } = require('./enqueue_places_crawler');
  * @param page
  */
 const extractPlaceDetail = async (page) => {
-    // Extracts basic information
+    // Extract basic information
     const titleSel = 'h1.section-hero-header-title';
     await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
     const detail = await page.evaluate(() => {
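
The hunk above touches the first stage of `extractPlaceDetail`. A stripped-down sketch of that stage; `h1.section-hero-header-title` and the `DEFAULT_TIMEOUT` usage come from the diff, while the score and address selectors inside `evaluate` are placeholders rather than the actor's real ones:

```js
// Sketch of the "Extract basic information" stage (placeholder selectors marked below).
const DEFAULT_TIMEOUT = 60 * 1000;

const extractBasicInfoSketch = async (page) => {
    const titleSel = 'h1.section-hero-header-title';
    await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
    return page.evaluate((sel) => {
        const text = (selector) => {
            const el = document.querySelector(selector);
            return el ? el.textContent.trim() : null;
        };
        return {
            title: text(sel),
            totalScore: parseFloat(text('.section-star-display')) || null, // placeholder selector
            address: text('[data-item-id="address"]'), // placeholder selector
        };
    }, titleSel);
};
```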
@@ -24,7 +24,7 @@ const extractPlaceDetail = async (page) => {
         };
     });

-    // Extracty histogram for popular times
+    // Extract histogram for popular times
     const histogramSel = '.section-popular-times';
     if (await page.$(histogramSel)) {
         detail.popularTimesHistogram = await page.evaluate(() => {
@@ -59,7 +59,7 @@ const extractPlaceDetail = async (page) => {
         });
     }

-    // Extracts reviews
+    // Extract reviews
     detail.reviews = [];
     const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]';
     if (detail.totalScore) {
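
For the review stage above, the concrete pieces in the diff are the `detail.reviews` array, the `detail.totalScore` guard, and the more-reviews button selector. A sketch of how clicking through to the reviews pane could look; the per-review selectors and the wait are placeholders, not the actor's real code:

```js
// Sketch of the "Extract reviews" stage.
// 'button[jsaction="pane.reviewChart.moreReviews"]' comes from the diff; review item selectors are placeholders.
const extractReviewsSketch = async (page, detail) => {
    detail.reviews = [];
    if (!detail.totalScore) return detail; // no rating means there are no reviews to open

    const reviewsButtonSel = 'button[jsaction="pane.reviewChart.moreReviews"]';
    const reviewsButton = await page.$(reviewsButtonSel);
    if (!reviewsButton) return detail;

    await reviewsButton.click();
    await page.waitForSelector('.section-review', { timeout: 60 * 1000 }); // placeholder selector
    detail.reviews = await page.$$eval('.section-review', (els) => els.map((el) => ({
        name: el.querySelector('.section-review-title') ? el.querySelector('.section-review-title').textContent.trim() : null, // placeholder selector
        text: el.querySelector('.section-review-text') ? el.querySelector('.section-review-text').textContent.trim() : null, // placeholder selector
    })));
    return detail;
};
```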
@@ -117,7 +117,7 @@ const extractPlaceDetail = async (page) => {
         await page.click('button.section-header-back-button');
     }

-    // Extracts place images
+    // Extract place images
     await page.waitForSelector(titleSel, { timeout: DEFAULT_TIMEOUT });
     const imagesButtonSel = '.section-image-pack-image-container';
     const imagesButton = await page.$(imagesButtonSel);
@@ -153,8 +153,8 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) =>
         requestQueue,
         maxRequestRetries: MAX_PAGE_RETRIES,
         retireInstanceAfterRequestCount: 10,
-        handlePageTimeoutSecs: 15 * 60, // 15 min because startUrl enqueueing
-        maxOpenPagesPerInstance: 1, // Because startUrl enqueueing crashes if we mixed tabs with details scraping
+        handlePageTimeoutSecs: 15 * 60, // long timeout, because of startUrl enqueueing
+        maxOpenPagesPerInstance: 1, // startUrl enqueueing crashes if its tab is mixed with other scraping
     };
     if (maxCrawledPlaces) {
         crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl
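
The two reworded comments above belong to the options object built in `setUpCrawler`. For context, a sketch of how those options plausibly come together in an Apify SDK 0.x `PuppeteerCrawler`; the option names and values mirror the diff, while the `handlePageFunction` body is a placeholder:

```js
// Sketch of assembling the crawler options shown above (Apify SDK 0.x style assumed).
const Apify = require('apify');

const MAX_PAGE_RETRIES = 5;

const setUpCrawlerSketch = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => {
    const crawlerOpts = {
        launchPuppeteerOptions,
        requestQueue,
        maxRequestRetries: MAX_PAGE_RETRIES,
        retireInstanceAfterRequestCount: 10,
        handlePageTimeoutSecs: 15 * 60, // generous: the startUrl request enqueues the whole listing
        maxOpenPagesPerInstance: 1, // keep startUrl enqueueing isolated from other scraping tabs
    };
    if (maxCrawledPlaces) {
        crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // +1 for the startUrl request itself
    }
    return new Apify.PuppeteerCrawler({
        ...crawlerOpts,
        handlePageFunction: async ({ request, page }) => {
            // placeholder: route the startUrl to listing enqueueing, detail requests to extraction
        },
    });
};
```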
@@ -2,13 +2,13 @@ const Apify = require('apify');
 const rp = require('request-promise');

 /**
- * Check if user some of user proxies work for Google Maps
+ * Check if some of the user's proxies work for Google Maps
  * @param proxyConfig
  */
 const proxyCheck = async (proxyConfig) => {
     const proxy = Apify.getApifyProxyUrl({ groups: proxyConfig.apifyProxyGroups });

-    // Check if user used Apify Proxy
+    // Check if the user uses Apify Proxy
     if (!proxyConfig.useApifyProxy) {
         return {
             isPass: false,
@@ -44,4 +44,3 @@ const proxyCheck = async (proxyConfig) => {
 module.exports = {
     proxyCheck,
 };
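
Beyond the comment fixes, the function above is a connectivity probe. A sketch of a proxy check against Google Maps using the same `request-promise` and `Apify.getApifyProxyUrl` seen in the diff; the test URL, timeout, and the `message` field are assumptions, while `isPass` and the `proxyConfig` fields come from the diff:

```js
// Sketch of a Google Maps proxy check (test URL and failure message shape are assumptions).
const Apify = require('apify');
const rp = require('request-promise');

const proxyCheckSketch = async (proxyConfig) => {
    if (!proxyConfig.useApifyProxy) {
        // Only Apify Proxy configurations are checked in this sketch.
        return { isPass: false, message: 'Only Apify Proxy is checked here.' };
    }
    const proxyUrl = Apify.getApifyProxyUrl({ groups: proxyConfig.apifyProxyGroups });
    try {
        // Load Google Maps through the proxy; a successful response counts as a pass.
        await rp({ url: 'https://www.google.com/maps', proxy: proxyUrl, timeout: 30 * 1000 });
        return { isPass: true };
    } catch (err) {
        return { isPass: false, message: err.message };
    }
};
```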