mirror of
https://github.com/davidjohnbarton/crawler-google-places.git
synced 2025-12-12 16:38:45 +00:00
Fixed maxCrawledPlaces, infiniteScroll
This commit is contained in:
parent
ce135fb8e8
commit
9918348e29
|
|
@ -28,7 +28,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
|
|||
log.info(`Added to queue ${url}`);
|
||||
if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
|
||||
log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
|
||||
break;
|
||||
return true;
|
||||
}
|
||||
|
||||
await page.click('.section-back-to-list-button');
|
||||
|
|
@ -68,6 +68,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
|
|||
|
||||
// In case there is listing, go through all details, limits with maxPlacesPerCrawl
|
||||
const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
|
||||
let isFinished;
|
||||
while (true) {
|
||||
await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
||||
const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText);
|
||||
|
|
@ -78,17 +79,17 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
|
|||
log.debug(`Skiped pagination ${from} - ${to}, already done!`);
|
||||
} else {
|
||||
log.debug(`Added links from pagination ${from} - ${to}`);
|
||||
await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
|
||||
isFinished = await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
|
||||
listingPagination.from = from;
|
||||
listingPagination.to = to;
|
||||
await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
|
||||
}
|
||||
await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
||||
if (!isFinished) await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
|
||||
const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
|
||||
return !!$(nextButtonSelector).attr('disabled');
|
||||
}, nextButtonSelector);
|
||||
const noResultsEl = await page.$('.section-no-result-title');
|
||||
if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
|
||||
if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl <= to) || isFinished) {
|
||||
break;
|
||||
} else {
|
||||
// NOTE: puppeteer API click() didn't work :|
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => {
|
|||
}
|
||||
await sleep(defaultScrollDelay);
|
||||
}
|
||||
page.removeAllListeners('request');
|
||||
logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
|
||||
} catch (err) {
|
||||
logError('An exception thrown in infiniteScroll()', err);
|
||||
|
|
|
|||
|
|
@ -1,8 +0,0 @@
|
|||
const testInput = {
|
||||
searchString: 'pubs new york',
|
||||
maxCrawledPlaces: 2,
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
testInput,
|
||||
};
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
const { expect } = require('chai');
|
||||
const fs = require('fs');
|
||||
const { spawnSync } = require('child_process');
|
||||
const path = require('path');
|
||||
const writeJson = require('write-json');
|
||||
const { testInput } = require('./config');
|
||||
|
||||
const TEST_STORAGE_FOLDER = 'apify_storage_test';
|
||||
const TEST_KV_STORAGE_FOLDER = 'key_value_stores';
|
||||
|
||||
describe('Crawler', () => {
|
||||
before(() => {
|
||||
spawnSync('cd ..');
|
||||
fs.mkdirSync(TEST_STORAGE_FOLDER);
|
||||
fs.mkdirSync(path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER));
|
||||
|
||||
const defaultKvsDir = path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER, 'default');
|
||||
fs.mkdirSync(defaultKvsDir);
|
||||
writeJson.sync(path.join(defaultKvsDir, 'INPUT.json'), testInput);
|
||||
});
|
||||
it('should work on local', () => {
|
||||
spawnSync('npm', ['run', 'start'], {
|
||||
env: {
|
||||
APIFY_LOCAL_STORAGE_DIR: `./${TEST_STORAGE_FOLDER}`,
|
||||
},
|
||||
});
|
||||
expect('1').to.be.equal('1');
|
||||
});
|
||||
after(() => {
|
||||
spawnSync('cd test');
|
||||
});
|
||||
});
|
||||
Loading…
Reference in New Issue
Block a user