diff --git a/src/infinite_scroll.js b/src/infinite_scroll.js index df08416..4544a92 100644 --- a/src/infinite_scroll.js +++ b/src/infinite_scroll.js @@ -2,7 +2,6 @@ const Apify = require('apify'); const { sleep, log } = Apify.utils; -const logError = (msg, e) => log.exception(e, msg); const logInfo = (msg) => log.info(msg); const logDebug = (msg) => log.debug(msg); @@ -76,7 +75,7 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => { let scrollInfo = await getPageScrollInfo(page, elementToScroll); logInfo(`Infinite scroll started (${stringifyScrollInfo(scrollInfo)}).`); - let previosReviewsCount = 0; + let previousReviewsCount = 0; while (true) { scrollInfo = await getPageScrollInfo(page, elementToScroll); @@ -104,14 +103,14 @@ module.exports = async (page, maxHeight, elementToScroll = 'body') => { const reviewsCount = await page.evaluate(() => $('div.section-review').length); /** - * If the page is scrolled to the very bottom or beyond - * maximum height and loader is not displayed and we don't find new reviews, we are done. - */ - if (reviewsCount === previosReviewsCount + * If the page is scrolled to the very bottom or beyond + * maximum height and loader is not displayed and we don't find new reviews, we are done. + */ + if (reviewsCount === previousReviewsCount && (scrollInfo.scrollTop + scrollInfo.clientHeight >= Math.min(scrollInfo.scrollHeight, maxHeight)) && !isLoaderOnPage ) break; - previosReviewsCount = reviewsCount; + previousReviewsCount = reviewsCount; // Otherwise we try to scroll down await scrollTo(page, elementToScroll, maxHeight); diff --git a/src/main.js b/src/main.js index 4295c53..4c8f54c 100644 --- a/src/main.js +++ b/src/main.js @@ -1,11 +1,12 @@ const Apify = require('apify'); const placesCrawler = require('./places_crawler'); +const resultJsonSchema = require('./result_item_schema'); const { proxyCheck } = require('./proxy_check'); const { log } = Apify.utils; Apify.main(async () => { const input = await Apify.getValue('INPUT'); - const { searchString, proxyConfig, lat, lng, maxCrawledPlaces } = input; + const { searchString, proxyConfig, lat, lng, maxCrawledPlaces, regularTestRun } = input; if (!searchString) throw new Error('Attribute searchString missing in input.'); @@ -36,5 +37,17 @@ Apify.main(async () => { const crawler = placesCrawler.setUpCrawler(launchPuppeteerOptions, requestQueue, maxCrawledPlaces); await crawler.run(); + if (regularTestRun) { + const { defaultDatasetId: datasetId } = Apify.getEnv(); + await Apify.call('drobnikj/check-crawler-results', { + datasetId, + options: { + minOutputtedPages: 5, + jsonSchema: resultJsonSchema, + notifyTo: 'jakub.drobnik@apify.com', + }, + }); + } + log.info('Done!'); }); diff --git a/src/places_crawler.js b/src/places_crawler.js index b76c537..8856255 100644 --- a/src/places_crawler.js +++ b/src/places_crawler.js @@ -154,7 +154,7 @@ const setUpCrawler = (launchPuppeteerOptions, requestQueue, maxCrawledPlaces) => maxRequestRetries: MAX_PAGE_RETRIES, retireInstanceAfterRequestCount: 10, handlePageTimeoutSecs: 15 * 60, // long timeout, because of startUrl enqueueing - maxOpenPagesPerInstance: 1, // Because of startUrl enqueueing crashes if we mix tabs with another scraping + maxOpenPagesPerInstance: 1, // because of startUrl enqueueing crashes if we mix tabs with another scraping }; if (maxCrawledPlaces) { crawlerOpts.maxRequestsPerCrawl = maxCrawledPlaces + 1; // The first one is startUrl diff --git a/src/result_item_schema.json b/src/result_item_schema.json new file mode 100644 index 0000000..04e02c2 --- /dev/null +++ b/src/result_item_schema.json @@ -0,0 +1,250 @@ +{ + "definitions": {}, + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://example.com/root.json", + "type": "object", + "title": "The Root Schema", + "required": [ + "title", + "totalScore", + "categoryName", + "address", + "plusCode", + "reviews", + "reviewsCount", + "imageUrls", + "url" + ], + "properties": { + "title": { + "$id": "#/properties/title", + "type": "string", + "title": "The Title Schema", + "default": "", + "examples": [ + "Bank DLR Station" + ], + "pattern": "^(.*)$" + }, + "totalScore": { + "$id": "#/properties/totalScore", + "type": "number", + "title": "The Totalscore Schema", + "default": 0.0, + "examples": [ + 3.4 + ] + }, + "categoryName": { + "$id": "#/properties/categoryName", + "type": "string", + "title": "The Categoryname Schema", + "default": "", + "examples": [ + "Stanice úzkokolejné dráhy" + ], + "pattern": "^(.*)$" + }, + "address": { + "$id": "#/properties/address", + "type": "string", + "title": "The Address Schema", + "default": "", + "examples": [ + "Cornhill, London EC3V 3NR, Velká Británie" + ], + "pattern": "^(.*)$" + }, + "plusCode": { + "$id": "#/properties/plusCode", + "type": "string", + "title": "The Pluscode Schema", + "default": "", + "examples": [ + "GW76+6H City, London, Velká Británie" + ], + "pattern": "^(.*)$" + }, + "reviews": { + "$id": "#/properties/reviews", + "type": "array", + "title": "The Reviews Schema", + "items": { + "$id": "#/properties/reviews/items", + "type": "object", + "title": "The Items Schema", + "required": [ + "name", + "text", + "stars", + "publishAt", + "likesCount", + "responseFromOwnerText" + ], + "properties": { + "name": { + "$id": "#/properties/reviews/items/properties/name", + "type": "string", + "title": "The Name Schema", + "default": "", + "examples": [ + "selion shkenza" + ], + "pattern": "^(.*)$" + }, + "text": { + "$id": "#/properties/reviews/items/properties/text", + "type": "string", + "title": "The Text Schema", + "default": "", + "examples": [ + "" + ], + "pattern": "^(.*)$" + }, + "stars": { + "$id": "#/properties/reviews/items/properties/stars", + "type": "string", + "title": "The Stars Schema", + "default": "", + "examples": [ + "4 hvězdičky" + ], + "pattern": "^(.*)$" + }, + "publishAt": { + "$id": "#/properties/reviews/items/properties/publishAt", + "type": "string", + "title": "The Publishat Schema", + "default": "", + "examples": [ + "před týdnem" + ], + "pattern": "^(.*)$" + }, + "likesCount": { + "$id": "#/properties/reviews/items/properties/likesCount", + "type": "string", + "title": "The Likescount Schema", + "default": "", + "examples": [ + "" + ], + "pattern": "^(.*)$" + }, + "responseFromOwnerText": { + "$id": "#/properties/reviews/items/properties/responseFromOwnerText", + "type": "string", + "title": "The Responsefromownertext Schema", + "default": "", + "examples": [ + "" + ], + "pattern": "^(.*)$" + } + } + } + }, + "reviewsCount": { + "$id": "#/properties/reviewsCount", + "type": "integer", + "title": "The Reviewscount Schema", + "default": 0, + "examples": [ + 84 + ] + }, + "imageUrls": { + "$id": "#/properties/imageUrls", + "type": "array", + "title": "The Imageurls Schema", + "items": { + "$id": "#/properties/imageUrls/items", + "type": "string", + "title": "The Items Schema", + "default": "", + "examples": [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Bankwbankofengland.jpg/250px-Bankwbankofengland.jpg", + "https://lh5.googleusercontent.com/p/AF1QipNvg2r-Qtsf763s8Lj739j2Y7YcDbv2Pn8W7SgC=s429-k-no", + "https://lh5.googleusercontent.com/p/AF1QipN5fE52t8nJxVe9jQNBChOMc8wfQ_XkonjlpJAD=s1056-k-no-pi-10.949902-ya22.500002-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipM26i-WtnjwdKx6m9w1a0QZxhpmQhoSguiLeW9d=s1056-k-no-pi-21.489584-ya277.58334-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipMe61_YMCRe8Nh_BOu7mpk066C3f4ObxwVddYxI=s553-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOnBW_96gk4yOCUl72HdvBSBB5cfYDV4u1rjbel=s312-k-no", + "https://lh5.googleusercontent.com/p/AF1QipN9xcqOJgfgv48gMAbw5fVy3ptLCg1YsrGSplow=s1056-k-no-pi-28.124998-ya301.67706-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipOTzEjwj6uGo8LhhDP0D9IOs92uTZPFQrjyjhnu=s1056-k-no-pi-16.315458-ya275.4129-ro-3.2435107-fo100", + "https://lh5.googleusercontent.com/p/AF1QipMDovdM-0DVbXrMDgW_Zpt6DV8wEt4VXAyZg87-=s378-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMpXC2H5gPDnNTxAPIWweY_KiZ7a6ul5woEUs2P=s504-k-no", + "https://lh5.googleusercontent.com/p/AF1QipNtFYrNlCak7skLIAX7XPLysdBFC7I_Ymhh_AXl=s1056-k-no-pi0-ya277.5-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipMVhYopcr2KJc8FGHqS7xh1vmlib-_g9ojJmFeD=s792-k-no", + "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Bankwbankofengland.jpg/280px-Bankwbankofengland.jpg", + "https://lh5.googleusercontent.com/p/AF1QipMzkYuzRS0bija8AHXbz96NGbPYJqxR4exmwVti=s429-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMbrVMaGk6eaJCEIpCJVduUEpoR93kule721plf=s1056-k-no-pi-26.666666-ya154.59375-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipOEYIPZY-stihaZ1urLMfj46a1Q1fJWWuU5G2CP=s312-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMM1xV1YW4baOA7tSlzsgFbNh_bOcS9X9fNUYOg=s554-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOS60FjDOTc1UR28D74-tRGcl5ZBL_hLlXAHlcJ=s1056-k-no-pi-9.822917-ya129.01044-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipMMe7P3Vqt55sHWLrmb3uw0h_jj-JS4eC_Umtgw=s700-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPxxpoC6cU6LIZQQamfPaoQURTQFdK4rdmH3jnC=s700-k-no", + "https://c7.alamy.com/compes/eamy4d/la-estacion-de-metro-de-banco-ciudad-de-londres-reino-unido-eamy4d.jpg", + "https://lh5.googleusercontent.com/p/AF1QipP2gh4XzRdq680lkQR6epsQbHiE1rg5NbyYlFtX=s400-k-no", + "https://lh5.googleusercontent.com/p/AF1QipNPn04F7IZzusE5kkbkp2lN5iTX4gTs7HDpyMxy=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPhwHHfdov2yHM-0DhpBTc3-Z2cr0p2QhpImfM=s1056-k-no-pi-13.614583-ya231.11461-ro0-fo100", + "https://lh5.googleusercontent.com/p/AF1QipNfEr2GSCNrRBAGFeN-awjX3w4cJJP-6qv0zfoj=s1056-k-no", + "https://upload.wikimedia.org/wikipedia/commons/thumb/c/ce/City_of_London_arms_at_Bank_station.JPG/250px-City_of_London_arms_at_Bank_station.JPG", + "https://lh5.googleusercontent.com/p/AF1QipPVO1I7UGvUjjRaZ8wY4VYFu1SkOgaq4i9a8hqg=s507-k-no", + "https://lh5.googleusercontent.com/p/AF1QipNHMiaBFc-SjbhXvj4WzOZN3MBwdkSwbR6pdKRF=s1056-k-no-pi-22.297846-ya177.4561-ro-0.10955643-fo100", + "https://lh5.googleusercontent.com/p/AF1QipOf1Mvt84IX0hQgkKO5-3mDRCHqhePUa7pEh2ZR=s338-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPizBCMxmGkgBP4C1NY1G-0T3FHZyDjzqyPbSpc=s451-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPcwZHykJQy-2ZsjUj85VavnwoDiWGFsmd0sJJS=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipNnPYABVVjBm-R3feClWKjldxI5qJgi4ROkjcLE=s395-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMIhOEPUTrQhBKTE76sVoQbI3p3_1SEkHLV77XI=s395-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOeXhmlI5CV8dGQUTA5oFrukCHJaEN6193crtHO=s790-k-no", + "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Bankwbankofengland.jpg/220px-Bankwbankofengland.jpg", + "https://lh5.googleusercontent.com/p/AF1QipOql79vIzrePHpilg4y5RDlIQQTtC5SxOIzXqNp=s429-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOwsSkPYWVuhNdpxYj4A25Hd5-A6Z0PLIA6ihXl=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPT6CSUCJtyfMOenTaxQ0KIXxmQIjF9Y1KeiO7g=s504-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPWuFfYtNO3Nmj8I240_XYgV0vOHQzveC_YWMnV=s378-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPsLY1uqtZuJ3XLXoe3wsnKds_vopx2xdIkNmuu=s190-k-no", + "https://lh5.googleusercontent.com/p/AF1QipO6568-hKAna0YkztNhv3k7AakE78FiGMVCaqe2=s290-k-no", + "https://lh5.googleusercontent.com/p/AF1QipO5R_6w9i3gjGtDFS7q5XlrPgod13lqWinpUHRy=s645-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPlWgChBoSWVB8QZeSy5Ov7k48TQsIP_6nl203S=s452-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOjtTDA6gnx_Ou50Sj1XkfQvMWzcos6vJvhapIg=s339-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPLzc6JFM8aRAKxASxNoAf_2EAoXLypdIwmEgTr=s394-k-no", + "https://lh5.googleusercontent.com/p/AF1QipP1QZieXU6jynzlz2GPdvy0f7HHAYtDS6nHD8Sa=s392-k-no", + "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Bank_Tube_Station.jpg/240px-Bank_Tube_Station.jpg", + "https://lh5.googleusercontent.com/p/AF1QipO1bJfpO3ycyffF2Fa9W-ITeToIYvKNjcX6Dhk=s688-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPDRRRxG63uSeBTaRpfVzYi9nLKe4iD-R9jVQJR=s267-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOiMqKWgSegg1lJwzAPyJHeVOvv-f-tEIHoLKYY=s333-k-no", + "https://lh5.googleusercontent.com/p/AF1QipM_rozyQWGD8PZ5xBHmvh5WGpM6wC6xMdJT-2-w=s453-k-no", + "https://lh5.googleusercontent.com/p/AF1QipN8wVq1JjC5tAqfWXT7Pa_dhCiGszEhbS_N0HpX=s504-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMIFKdF8yTSglvxGSzTnD0uKRelpYVVXtkCieWw=s378-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMgnchxTukvmFTvcGemcyaamHn0YqAC-1bHbYq4=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipP5RyjKq6Lmsxps7cx2xS6tRLu_rFYGFAghDmCr=s526-k-no", + "https://lh5.googleusercontent.com/p/AF1QipM-41Ei5_Jxpzqx7excrU8SJNxfOqcN4_z3zuBW=s526-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOsR6U39vo-iR4KkbUscaAStFPnwIMQ6s94nrpE=s522-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMWmt7zfzBpBP4qzr1St4p3kmJH8CHjFcP7xoFd=s395-k-no", + "https://lh5.googleusercontent.com/p/AF1QipP6xGDmvADIm7qgaQPB52HggI-NT1eUjEkACUKV=s395-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOX8Xlg2RR7jCoHC5FMSRrfSP9alDbfo1P2KvaO=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipNZ0WfoBq7DUm1Fe0PX8GlK_7k5WXxgIRElGUhn=s525-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMl2Zo22ebfOqSkywf90OTavKhlonWX0DqFCuoN=s350-k-no", + "https://lh5.googleusercontent.com/p/AF1QipM-xPa3Lxayw3d19iRmDCvdGJGwQE1LCs3QmIic=s525-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPjxOH0k4S2Xwx2tEODSl-ww8v9z1hUXyDeildV=s350-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOG2wkagJbVMk9T_3--pSQcMkVg_ZBLsUA1gopN=s792-k-no", + "https://lh5.googleusercontent.com/p/AF1QipPPzQBdvA6VUDtshoDlVImog56lrsK4u6oobVw4=s339-k-no", + "https://lh5.googleusercontent.com/p/AF1QipMyrPDDLnXFkOcZhrYLoeY9i1CWTxkLXWXIo15P=s312-k-no", + "https://lh5.googleusercontent.com/p/AF1QipOlvraSBZXdmrq8n7CG9m82EARhCt222FwA-3dj=s555-k-no" + ], + "pattern": "^(.*)$" + } + }, + "url": { + "$id": "#/properties/url", + "type": "string", + "title": "The Url Schema", + "default": "", + "examples": [ + "https://www.google.com/maps/place/Bank+DLR+Station/@51.5131071,-0.0907444,17z/data=!3m1!4b1!4m5!3m4!1s0x487603549b941c59:0xfa641eadd72bb1e2!8m2!3d51.5131071!4d-0.0885557" + ], + "pattern": "^(.*)$" + } + } +}