mirror of
https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00
Fixed maxCrawledPlaces, infiniteScroll
This commit is contained in:
parent
ce135fb8e8
commit
9918348e29
|
|
@ -28,7 +28,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
|
||||||
log.info(`Added to queue ${url}`);
|
log.info(`Added to queue ${url}`);
|
||||||
if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
|
if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
|
||||||
log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
|
log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
|
||||||
break;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
await page.click('.section-back-to-list-button');
|
await page.click('.section-back-to-list-button');
|
||||||
|
|
@ -68,6 +68,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
|
||||||
|
|
||||||
// In case there is listing, go through all details, limits with maxPlacesPerCrawl
|
// In case there is listing, go through all details, limits with maxPlacesPerCrawl
|
||||||
const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
|
const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
|
||||||
|
let isFinished;
|
||||||
while (true) {
|
while (true) {
|
||||||
await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
||||||
const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText);
|
const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText);
|
||||||
|
|
@ -78,17 +79,17 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
|
||||||
log.debug(`Skiped pagination ${from} - ${to}, already done!`);
|
log.debug(`Skiped pagination ${from} - ${to}, already done!`);
|
||||||
} else {
|
} else {
|
||||||
log.debug(`Added links from pagination ${from} - ${to}`);
|
log.debug(`Added links from pagination ${from} - ${to}`);
|
||||||
await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
|
isFinished = await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
|
||||||
listingPagination.from = from;
|
listingPagination.from = from;
|
||||||
listingPagination.to = to;
|
listingPagination.to = to;
|
||||||
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
|
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
|
||||||
}
|
}
|
||||||
await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
if (!isFinished) await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
||||||
const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
|
const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
|
||||||
return !!$(nextButtonSelector).attr('disabled');
|
return !!$(nextButtonSelector).attr('disabled');
|
||||||
}, nextButtonSelector);
|
}, nextButtonSelector);
|
||||||
const noResultsEl = await page.$('.section-no-result-title');
|
const noResultsEl = await page.$('.section-no-result-title');
|
||||||
if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
|
if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl <= to) || isFinished) {
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
// NOTE: puppeteer API click() didn't work :|
|
// NOTE: puppeteer API click() didn't work :|
|
||||||
|
|
|
||||||
|
|
@ -119,6 +119,7 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => {
|
||||||
}
|
}
|
||||||
await sleep(defaultScrollDelay);
|
await sleep(defaultScrollDelay);
|
||||||
}
|
}
|
||||||
|
page.removeAllListeners('request');
|
||||||
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
|
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
logError('An exception thrown in infiniteScroll()', err);
|
logError('An exception thrown in infiniteScroll()', err);
|
||||||
|
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
const testInput = {
|
|
||||||
searchString: 'pubs new york',
|
|
||||||
maxCrawledPlaces: 2,
|
|
||||||
};
|
|
||||||
|
|
||||||
module.exports = {
|
|
||||||
testInput,
|
|
||||||
};
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
const { expect } = require('chai');
|
|
||||||
const fs = require('fs');
|
|
||||||
const { spawnSync } = require('child_process');
|
|
||||||
const path = require('path');
|
|
||||||
const writeJson = require('write-json');
|
|
||||||
const { testInput } = require('./config');
|
|
||||||
|
|
||||||
const TEST_STORAGE_FOLDER = 'apify_storage_test';
|
|
||||||
const TEST_KV_STORAGE_FOLDER = 'key_value_stores';
|
|
||||||
|
|
||||||
describe('Crawler', () => {
|
|
||||||
before(() => {
|
|
||||||
spawnSync('cd ..');
|
|
||||||
fs.mkdirSync(TEST_STORAGE_FOLDER);
|
|
||||||
fs.mkdirSync(path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER));
|
|
||||||
|
|
||||||
const defaultKvsDir = path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER, 'default');
|
|
||||||
fs.mkdirSync(defaultKvsDir);
|
|
||||||
writeJson.sync(path.join(defaultKvsDir, 'INPUT.json'), testInput);
|
|
||||||
});
|
|
||||||
it('should work on local', () => {
|
|
||||||
spawnSync('npm', ['run', 'start'], {
|
|
||||||
env: {
|
|
||||||
APIFY_LOCAL_STORAGE_DIR: `./${TEST_STORAGE_FOLDER}`,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
expect('1').to.be.equal('1');
|
|
||||||
});
|
|
||||||
after(() => {
|
|
||||||
spawnSync('cd test');
|
|
||||||
});
|
|
||||||
});
|
|
||||||
Loading…
Reference in New Issue
Block a user