crawler-google-places/src/enqueue_places_crawler.js

102 lines
4.7 KiB
JavaScript

const Apify = require('apify');
const { sleep } = Apify.utils;
const { injectJQuery } = Apify.utils.puppeteer;
const { MAX_PAGE_RETRIES, DEFAULT_TIMEOUT, LISTING_PAGINATION_KEY } = require('./consts');
/**
 * Waits until the `#searchbox` element no longer carries the `loading` class.
 *
 * @param page Puppeteer page to poll
 * @returns whatever `page.waitFor` returns for the polling predicate
 */
const waitForGoogleMapLoader = (page) => {
    // Evaluated inside the page, so it must not reference anything from this module.
    const isSearchBoxIdle = () => {
        const searchBox = document.querySelector('#searchbox');
        return !searchBox.classList.contains('loading');
    };
    return page.waitFor(isSearchBoxIdle, { timeout: DEFAULT_TIMEOUT });
};
/**
 * Enqueues the detail URL of every place shown on the current listing page.
 *
 * Each result is opened by clicking its heading, the resulting page URL is added to the queue
 * with the `detail` label, and the "back to list" button is clicked before the next result.
 *
 * @param page Puppeteer page showing the search listing
 * @param requestQueue queue that receives the detail requests
 */
const enqueueAllUrlsFromPagination = async (page, requestQueue) => {
    const listItemSelector = '.section-result';
    const backToListSelector = '.section-back-to-list-button';

    // Evaluated inside the page: true once at least `position + 1` result headings exist.
    const hasHeadingAt = (position) => document.querySelectorAll('.section-result h3').length >= position + 1;

    // Only the number of results is taken from this first lookup.
    const total = (await page.$$(listItemSelector)).length;

    let position = 0;
    while (position < total) {
        await page.waitForSelector('.searchbox', { timeout: DEFAULT_TIMEOUT });
        await waitForGoogleMapLoader(page);
        await page.waitFor(hasHeadingAt, { timeout: DEFAULT_TIMEOUT }, position);

        // Element handles are looked up again on every pass instead of reusing earlier ones.
        const freshItems = await page.$$(listItemSelector);
        const heading = await freshItems[position].$('h3');
        await heading.click();

        await waitForGoogleMapLoader(page);
        await page.waitForSelector(backToListSelector, { timeout: DEFAULT_TIMEOUT });

        const detailUrl = page.url();
        await requestQueue.addRequest({ url: detailUrl, userData: { label: 'detail' } });
        console.log(`Added to queue ${detailUrl}`);

        await page.click(backToListSelector);
        position += 1;
    }
};
/**
 * Crawler that adds every place detail found for a search to the request queue.
 *
 * Opens `startUrl` in a fresh browser, types `searchString` into the search box and then either
 * enqueues the single detail page the search landed on, or walks the listing pagination and
 * enqueues the places of each page. After each processed page the pagination range is stored
 * under LISTING_PAGINATION_KEY; pages starting at or before `listingPagination.from` are skipped.
 *
 * On failure the browser is closed and the whole run is retried with the latest pagination
 * progress, up to MAX_PAGE_RETRIES times. Once the retries are used up the error is rethrown.
 *
 * @param startUrl URL opened before searching
 * @param searchString text typed into the search box
 * @param launchPuppeteerOptions options passed to Apify.launchPuppeteer
 * @param requestQueue queue receiving `{ url, userData: { label: 'detail' } }` requests
 * @param listingPagination last finished pagination range `{ from, to }`; may be empty or omitted
 * @param retries number of retries already used
 */
const enqueueAllPlaceDetailsCrawler = async (startUrl, searchString, launchPuppeteerOptions, requestQueue, listingPagination, retries = 0) => {
    // Progress lives in a local so the parameter is never reassigned; it is what a retry resumes from.
    let progress = listingPagination;
    let browser;
    try {
        browser = await Apify.launchPuppeteer(launchPuppeteerOptions);
        const page = await browser.newPage();
        await page._client.send('Emulation.clearDeviceMetricsOverride');
        await page.goto(startUrl);
        await injectJQuery(page);
        await page.type('#searchboxinput', searchString);
        await sleep(5000);
        await page.click('#searchbox-searchbutton');
        await sleep(5000);
        await waitForGoogleMapLoader(page);

        // In case there is no listing, put just detail page to queue
        const maybeDetailPlace = await page.$('h1.section-hero-header-title');
        if (maybeDetailPlace) {
            const url = page.url();
            await requestQueue.addRequest({ url, userData: { label: 'detail' } });
            return;
        }

        const nextButtonSelector = '#section-pagination-button-next';
        while (true) {
            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
            const paginationText = await page.$eval('.section-pagination-right', (el) => el.innerText);
            const rangeNumbers = paginationText.match(/\d+/g);
            if (!rangeNumbers) {
                // Fail with a readable message instead of a destructuring TypeError.
                throw new Error(`Cannot read pagination range from "${paginationText}"`);
            }
            const [fromString, toString] = rangeNumbers;
            const from = parseInt(fromString, 10);
            const to = parseInt(toString, 10);

            if (progress && progress.from && from <= progress.from) {
                console.log(`Skiped pagination ${from} - ${to}, already done!`);
            } else {
                console.log(`Added links from pagination ${from} - ${to}`);
                await enqueueAllUrlsFromPagination(page, requestQueue);
                progress = { from, to };
                await Apify.setValue(LISTING_PAGINATION_KEY, progress);
            }

            await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
            // `$` is the jQuery injected into the page above.
            const isNextPaginationDisabled = await page.evaluate((sel) => {
                return !!$(sel).attr('disabled');
            }, nextButtonSelector);
            const noResultsEl = await page.$('.section-no-result-title');
            if (isNextPaginationDisabled || noResultsEl) break;

            // NOTE: puppeteer API click() didn't work :(
            await page.evaluate((sel) => $(sel).click(), nextButtonSelector);
            await waitForGoogleMapLoader(page);
        }
    } catch (err) {
        if (retries >= MAX_PAGE_RETRIES) throw err;

        const attempt = retries + 1;
        console.log(`Retrying enqueueAllPlaceDetails for ${attempt} time, error:`);
        console.error(err);

        // Close the failed browser before the retry launches a new one, and clear the variable
        // so the finally block does not close it a second time. It is undefined when the
        // launch itself failed.
        const failedBrowser = browser;
        browser = undefined;
        if (failedBrowser) await failedBrowser.close();

        // A retry that succeeds ends this call normally; a retry that fails propagates its error.
        await enqueueAllPlaceDetailsCrawler(startUrl, searchString, launchPuppeteerOptions, requestQueue, progress, attempt);
    } finally {
        if (browser) await browser.close();
    }
};
// Expose the crawler to other modules under the name `run`.
module.exports = { run: enqueueAllPlaceDetailsCrawler };