Fixed maxCrawledPlaces, infiniteScroll

drobnikj 2019-01-08 10:47:27 +01:00
parent ce135fb8e8
commit 9918348e29
5 changed files with 6 additions and 44 deletions

View File

@@ -28,7 +28,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom,
         log.info(`Added to queue ${url}`);
         if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) {
             log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`);
-            break;
+            return true;
         }
         await page.click('.section-back-to-list-button');
@@ -68,6 +68,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
     // In case there is listing, go through all details, limits with maxPlacesPerCrawl
     const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]';
+    let isFinished;
     while (true) {
         await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
         const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText);
@@ -78,17 +79,17 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace
             log.debug(`Skiped pagination ${from} - ${to}, already done!`);
         } else {
             log.debug(`Added links from pagination ${from} - ${to}`);
-            await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
+            isFinished = await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl);
             listingPagination.from = from;
             listingPagination.to = to;
             await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination);
         }
-        await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
+        if (!isFinished) await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT });
         const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => {
             return !!$(nextButtonSelector).attr('disabled');
         }, nextButtonSelector);
         const noResultsEl = await page.$('.section-no-result-title');
-        if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) {
+        if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl <= to) || isFinished) {
             break;
         } else {
             // NOTE: puppeteer API click() didn't work :|
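The change threads a stop flag through the pagination loop: enqueueAllUrlsFromPagination returns true as soon as maxPlacesPerCrawl is reached, and the caller stores it in isFinished so it can break out instead of waiting for a next-page button that will never be clicked. Below is a minimal standalone sketch of that stop-flag pattern; the names enqueuePage, crawlAllPages and getPageUrls are illustrative, not the actor's actual code, and queue is only assumed to expose an Apify-style addRequest({ url }).

// Sketch only: hypothetical names, simplified logic.
const enqueuePage = async (urls, queue, offset, maxPlaces) => {
    for (const [i, url] of urls.entries()) {
        await queue.addRequest({ url });
        if (maxPlaces && offset + i + 1 >= maxPlaces) {
            return true; // limit reached, tell the caller to stop paginating
        }
    }
    return false;
};

const crawlAllPages = async (getPageUrls, queue, maxPlaces) => {
    let offset = 0;
    let isFinished = false;
    while (!isFinished) {
        const urls = await getPageUrls(offset); // one "page" of listing results
        if (!urls.length) break;                // no more results
        isFinished = await enqueuePage(urls, queue, offset, maxPlaces);
        offset += urls.length;
    }
};

Returning the flag, rather than only breaking inside the helper, is what lets the outer loop skip the waitForSelector for a next page that is never opened.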

View File

@@ -119,6 +119,7 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => {
             }
             await sleep(defaultScrollDelay);
         }
+        page.removeAllListeners('request');
         logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`);
     } catch (err) {
         logError('An exception thrown in infiniteScroll()', err);
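The added page.removeAllListeners('request') detaches the request listener that the scroll helper registers to build resourcesStats, so the callback stops firing once scrolling is done. A rough sketch of that register-then-clean-up pattern, assuming a listener-based stats collector; scrollWithStats, scrollOnce and isDone are illustrative names, not the helper's actual API.

// Sketch only: count requests while scrolling, then detach the listener.
const scrollWithStats = async (page, scrollOnce, isDone) => {
    const resourcesStats = { requests: 0 };
    const onRequest = () => { resourcesStats.requests += 1; };
    page.on('request', onRequest);
    try {
        while (!(await isDone())) {
            await scrollOnce();
        }
    } finally {
        // Detach so the callback stops firing after the helper returns;
        // page.removeAllListeners('request') in the diff does the same cleanup.
        page.removeListener('request', onRequest);
    }
    return resourcesStats;
};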

View File

@@ -1,8 +0,0 @@
-const testInput = {
-    searchString: 'pubs new york',
-    maxCrawledPlaces: 2,
-};
-
-module.exports = {
-    testInput,
-};

View File

@@ -1,32 +0,0 @@
-const { expect } = require('chai');
-const fs = require('fs');
-const { spawnSync } = require('child_process');
-const path = require('path');
-const writeJson = require('write-json');
-const { testInput } = require('./config');
-
-const TEST_STORAGE_FOLDER = 'apify_storage_test';
-const TEST_KV_STORAGE_FOLDER = 'key_value_stores';
-
-describe('Crawler', () => {
-    before(() => {
-        spawnSync('cd ..');
-        fs.mkdirSync(TEST_STORAGE_FOLDER);
-        fs.mkdirSync(path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER));
-        const defaultKvsDir = path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER, 'default');
-        fs.mkdirSync(defaultKvsDir);
-        writeJson.sync(path.join(defaultKvsDir, 'INPUT.json'), testInput);
-    });
-
-    it('should work on local', () => {
-        spawnSync('npm', ['run', 'start'], {
-            env: {
-                APIFY_LOCAL_STORAGE_DIR: `./${TEST_STORAGE_FOLDER}`,
-            },
-        });
-        expect('1').to.be.equal('1');
-    });
-
-    after(() => {
-        spawnSync('cd test');
-    });
-});