diff --git a/src/enqueue_places_crawler.js b/src/enqueue_places_crawler.js index 557a33c..3f846d0 100644 --- a/src/enqueue_places_crawler.js +++ b/src/enqueue_places_crawler.js @@ -28,7 +28,7 @@ const enqueueAllUrlsFromPagination = async (page, requestQueue, paginationFrom, log.info(`Added to queue ${url}`); if (maxPlacesPerCrawl && paginationFrom + resultIndex + 1 > maxPlacesPerCrawl) { log.info(`Reach max places per crawl ${maxPlacesPerCrawl}, stopped enqueuing new places.`); - break; + return true; } await page.click('.section-back-to-list-button'); @@ -68,6 +68,7 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace // In case there is listing, go through all details, limits with maxPlacesPerCrawl const nextButtonSelector = '[jsaction="pane.paginationSection.nextPage"]'; + let isFinished; while (true) { await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); const paginationText = await page.$eval('.n7lv7yjyC35__right', (el) => el.innerText); @@ -78,17 +79,17 @@ const enqueueAllPlaceDetails = async (page, searchString, requestQueue, maxPlace log.debug(`Skiped pagination ${from} - ${to}, already done!`); } else { log.debug(`Added links from pagination ${from} - ${to}`); - await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl); + isFinished = await enqueueAllUrlsFromPagination(page, requestQueue, from, maxPlacesPerCrawl); listingPagination.from = from; listingPagination.to = to; await Apify.setValue(LISTING_PAGINATION_KEY, listingPagination); } - await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); + if (!isFinished) await page.waitForSelector(nextButtonSelector, { timeout: DEFAULT_TIMEOUT }); const isNextPaginationDisabled = await page.evaluate((nextButtonSelector) => { return !!$(nextButtonSelector).attr('disabled'); }, nextButtonSelector); const noResultsEl = await page.$('.section-no-result-title'); - if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl < to)) { + if (isNextPaginationDisabled || noResultsEl || (maxPlacesPerCrawl && maxPlacesPerCrawl <= to) || isFinished) { break; } else { // NOTE: puppeteer API click() didn't work :| diff --git a/src/infinite_scroll.js b/src/infinite_scroll.js index 2eeb850..315c52b 100644 --- a/src/infinite_scroll.js +++ b/src/infinite_scroll.js @@ -119,6 +119,7 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => { } await sleep(defaultScrollDelay); } + page.removeAllListeners('request'); logInfo(`Infinite scroll finished (${stringifyScrollInfo(scrollInfo)} resourcesStats=${JSON.stringify(resourcesStats)})`); } catch (err) { logError('An exception thrown in infiniteScroll()', err); diff --git a/test/config.js b/test/config.js deleted file mode 100644 index 7936586..0000000 --- a/test/config.js +++ /dev/null @@ -1,8 +0,0 @@ -const testInput = { - searchString: 'pubs new york', - maxCrawledPlaces: 2, -}; - -module.exports = { - testInput, -}; diff --git a/test/crawler.local.js b/test/crawler.local.js deleted file mode 100644 index 83771b6..0000000 --- a/test/crawler.local.js +++ /dev/null @@ -1,32 +0,0 @@ -const { expect } = require('chai'); -const fs = require('fs'); -const { spawnSync } = require('child_process'); -const path = require('path'); -const writeJson = require('write-json'); -const { testInput } = require('./config'); - -const TEST_STORAGE_FOLDER = 'apify_storage_test'; -const TEST_KV_STORAGE_FOLDER = 'key_value_stores'; - -describe('Crawler', () => { - before(() => { - spawnSync('cd ..'); - fs.mkdirSync(TEST_STORAGE_FOLDER); - fs.mkdirSync(path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER)); - - const defaultKvsDir = path.join(TEST_STORAGE_FOLDER, TEST_KV_STORAGE_FOLDER, 'default'); - fs.mkdirSync(defaultKvsDir); - writeJson.sync(path.join(defaultKvsDir, 'INPUT.json'), testInput); - }); - it('should work on local', () => { - spawnSync('npm', ['run', 'start'], { - env: { - APIFY_LOCAL_STORAGE_DIR: `./${TEST_STORAGE_FOLDER}`, - }, - }); - expect('1').to.be.equal('1'); - }); - after(() => { - spawnSync('cd test'); - }); -}); diff --git a/test/crawler.platform.js b/test/crawler.platform.js deleted file mode 100644 index e69de29..0000000