Skip to content

Commit

Permalink
(fix) removing puppeteer to fetch page content in order to increase re…
Browse files Browse the repository at this point in the history
…quest processing and reduce the heap usage when running in Docker container
  • Loading branch information
NemesisX1 committed Oct 25, 2023
1 parent 9bfe3b8 commit aaee036
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 68 deletions.
9 changes: 0 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
FROM node:16

RUN apt-get update && apt-get install gnupg wget -y && \
wget --quiet --output-document=- https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/google-archive.gpg && \
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' && \
apt-get update && \
apt-get install google-chrome-stable -y --no-install-recommends && \
rm -rf /var/lib/apt/lists/*

RUN apt-get clean

WORKDIR /app

COPY package*.json ./
Expand Down
6 changes: 1 addition & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ npx ts-express

And the code architecture is mainly inspired by the default [Nest.js](https://nestjs.com/) one.

The core scrapper is made up of these two packages:
The core scrapper is made up of these packages:

- [Puppeteer](https://pptr.dev/) who act as the main scrapper
- [node-html-parser](https://www.npmjs.com/package/node-html-parser) who helps to parse the html document (thinking about removing this one later, I really guess that i can fully rely on Puppeteer)

> ### 🚀 Deployment
Expand All @@ -40,9 +39,6 @@ The core scrapper is made up of these two packages:
sudo docker compose up -d
```



Then go to `localhost:3213` or (localhost:PORT with PORT defined in the .env file)


**Elikem Medehou** [![Twitter Follow](https://img.shields.io/twitter/follow/juniormedehou_?label=Follow&style=social)](https://twitter.com/juniormedehou_)
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"build": "sh etc/build.sh",
"prod": "npm build && cd dist && npm i && node bin/www.js",
"docs": "ts-node src/swagger.ts",
"test": "jest --runInBand --detectOpenHandles --forceExit"
"test": "jest --runInBand --detectOpenHandles --forceExit --verbose"
},
"dependencies": {
"@sentry/node": "^7.74.1",
Expand Down
126 changes: 74 additions & 52 deletions src/services/scapping/scrapping.servive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import TheaterMovieBriefModel from "../../models/theater-movie-brief.model";
import TheaterDiffusionInfoModel from "../../models/theater-movie-diffusion-info.model";
import TheaterInfosModel from "../../models/theater-info.model";
import TheaterNameModel from "@/models/theater-name.model";
import axios from "axios";
import axios, { AxiosError, AxiosResponse } from "axios";
import * as cheerio from 'cheerio';

export default class ScrappingService implements BaseService {
Expand All @@ -21,27 +21,28 @@ export default class ScrappingService implements BaseService {
*/
public async availableMovies(lang: string = 'fr'): Promise<TheaterMovieBriefModel[]> {

const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
let response: AxiosResponse;

try {

await page.goto(
lang == 'en'
? `${infos.baseUrl}/en`
: `${infos.baseUrl}`);
response = await axios.get(lang == 'en' ? `${infos.baseUrl}/en` : `${infos.baseUrl}`);

} catch (error) {

const e = error as AxiosError;

this.logger.fatal('availableMovies');
this.logger.fatal((error as Error).message);
this.logger.fatal(e.message);

throw Error((error as Error).message);
if (e.response?.status == 404)
return [];
else
throw Error(e.message);
}

const result: TheaterMovieBriefModel[] = [];

const htmlRoot = parse(await page.content());
const htmlRoot = parse(response.data);

const aMovieList = htmlRoot.querySelectorAll('section.homepage-affiche > div.wrapper > div.homepage-affiche-list > a.homepage-affiche-list-movie');

Expand All @@ -61,7 +62,6 @@ export default class ScrappingService implements BaseService {
}
})

await browser.close();

return result;
}
Expand All @@ -72,9 +72,24 @@ export default class ScrappingService implements BaseService {
// TODO: rewrite this one by fetching theaters list directly from https://www.xml-sitemaps.com/download/www.canalolympia.com-52d54e4ae/sitemap.xml?view=1
public async theatersNames(): Promise<TheaterNameModel[]> {
const theaters: TheaterNameModel[] = [];
let response: AxiosResponse;

try {
const response = await axios.get(infos.baseUrl);

response = await axios.get(infos.baseUrl);

} catch (error) {

const e = error as AxiosError;

this.logger.fatal('theatersNames');
this.logger.fatal(e);

throw Error(e.message);
}

try {

const htmlRoot = cheerio.load(response.data);

// Get all anchor tags inside li elements with the specified class
Expand Down Expand Up @@ -135,6 +150,7 @@ export default class ScrappingService implements BaseService {
}
}
} catch (error) {

this.logger.fatal('theater names');
this.logger.fatal((error as Error).message);

Expand All @@ -151,35 +167,38 @@ export default class ScrappingService implements BaseService {
*/
public async theaterMovies(theaterName: string, lang: string = 'fr'): Promise<TheaterMovieBriefModel[]> {

const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
let response: AxiosResponse;

try {

await page.goto(
lang == 'en'
? `${infos.baseUrl}/en/${infos.theatersUrl}/${theaterName}-en`
: `${infos.baseUrl}/${infos.theatersUrl}/${theaterName}`);
response = await axios.get(lang == 'en'
? `${infos.baseUrl}/en/${infos.theatersUrl}/${theaterName}-en`
: `${infos.baseUrl}/${infos.theatersUrl}/${theaterName}`);

} catch (error) {

const e = error as AxiosError;

this.logger.fatal('movies');
this.logger.fatal((error as Error).message);
this.logger.fatal(e.message);

throw Error((error as Error).message);
if (e.response?.status == 404)
return [];
else
throw Error(e.message);
}

const elements = await page.$$('ul[data-date].theater-movies');
const htmlRoot = parse(response.data);

const elements = htmlRoot.querySelectorAll('ul[data-date].theater-movies');

const result: TheaterMovieBriefModel[] = [];


/// TODO: reduce complexity by using in a more efficient way the html parser and its querySelector
for (const element of elements) {

const text = await page.evaluate(el => el.outerHTML, element);

const root = parse(text);
const root = parse(element.outerHTML);

const rawDate = root.querySelector('ul')?.rawAttributes['data-date'] as string;

Expand Down Expand Up @@ -211,8 +230,6 @@ export default class ScrappingService implements BaseService {
});
}

await browser.close();

return result;
}

Expand All @@ -223,26 +240,31 @@ export default class ScrappingService implements BaseService {
*/
public async movieInfoBySlug(slug: string, lang: string = 'fr'): Promise<TheaterMovieModel | null> {

const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
let response: AxiosResponse;

const cleanSlug = slug.replace('-en', '');

try {

await page.goto(lang == 'en'
response = await axios.get(lang == 'en'
? `${infos.baseUrl}/en/${infos.moviesUrl}/${cleanSlug}-en`
: `${infos.baseUrl}/${infos.moviesUrl}/${cleanSlug}`);

} catch (error) {

const e = error as AxiosError;

this.logger.fatal('movieInfoBySlug');
this.logger.fatal(error);
this.logger.fatal(e.message);

if (e.response?.status == 404)
return null;
else
throw Error(e.message);

throw Error((error as Error).message);
}

const htmlRoot = parse(await page.content());
const htmlRoot = parse(response.data);

const title = htmlRoot.querySelector('div.movie-top-container-cover-content > h1')?.textContent;

Expand All @@ -254,7 +276,6 @@ export default class ScrappingService implements BaseService {
const brief = htmlRoot.querySelector('div.synopse-modal > p')?.textContent;
const trailerUrl = htmlRoot.querySelector('div.wrapper > div.movie > iframe')?.rawAttributes.src;


const TheaterMovie: TheaterMovieModel = {
title: title,
genre: genre!,
Expand All @@ -264,8 +285,6 @@ export default class ScrappingService implements BaseService {
trailerLink: trailerUrl!,
};

await browser.close();

return TheaterMovie;
}

Expand All @@ -276,26 +295,31 @@ export default class ScrappingService implements BaseService {
*/
public async movieDiffusionInfos(slug: string, lang: string = 'fr', theaterName?: string): Promise<TheaterDiffusionInfoModel[]> {

const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
let response: AxiosResponse;

const cleanSlug = slug.replace('-en', '');

try {

await page.goto(lang == 'en'
response = await axios.get(lang == 'en'
? `${infos.baseUrl}/en/${infos.moviesUrl}/${cleanSlug}-en`
: `${infos.baseUrl}/${infos.moviesUrl}/${cleanSlug}`);


} catch (error) {

const e = error as AxiosError;

this.logger.fatal('movieDiffusionInfos');
this.logger.fatal((error as Error).message);
this.logger.fatal(e.message);

throw Error((error as Error).message);
if (e.response?.status == 404)
return [];
else
throw Error(e.message);
}

const htmlRoot = parse(await page.content());
const htmlRoot = parse(response.data);

const sessionsInfos = htmlRoot.querySelector('div.sessions');

Expand Down Expand Up @@ -341,9 +365,7 @@ export default class ScrappingService implements BaseService {
dates: dates,
})
}
})

await browser.close();
});

return diffusionInfos;
}
Expand All @@ -355,28 +377,30 @@ export default class ScrappingService implements BaseService {
*/
public async theaterInfos(theaterName: string, lang: string = 'fr'): Promise<TheaterInfosModel> {

const browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
let response: AxiosResponse;

const cleanedTheaterName = theaterName.replace('-en', '');

try {

await page.goto(lang == 'en'
response = await axios.get(lang == 'en'
? `${infos.baseUrl}/en/${infos.theatersUrl}/${cleanedTheaterName}-en`
: `${infos.baseUrl}/${infos.theatersUrl}/${cleanedTheaterName}`);
: `${infos.baseUrl}/${infos.theatersUrl}/${cleanedTheaterName}`)


} catch (error) {

const e = error as AxiosError;

this.logger.fatal('theaterInfos');
this.logger.fatal(error);

throw Error((error as Error).message);
throw Error(e.message);

}


const htmlRoot = parse(await page.content());
const htmlRoot = parse(response.data);

const name = htmlRoot.querySelector('div.theater-top-container-cover-content > h1')?.textContent;
const location = htmlRoot.querySelector('div.theater-top-container-cover-content > a')?.textContent;
Expand Down Expand Up @@ -418,8 +442,6 @@ export default class ScrappingService implements BaseService {
media: media,
};

await browser.close();

return theaterInfos;
}

Expand Down
18 changes: 17 additions & 1 deletion src/tests/services/scrapping.service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ describe('Test on Scapping Service', () => {
test('if movieInfoBySlug is not working with bad slug', async () => {

return scrappingService.movieInfoBySlug('zkk').then((info) => {

expect(info).toBeNull();

});
Expand Down Expand Up @@ -151,6 +151,22 @@ describe('Test on Scapping Service', () => {

});


test('if theaterInfos is working with lang en', async () => {

return scrappingService.theaterInfos('wologuede', 'en').then((infos) => {

expect(infos).not.toBeNull();
expect(infos.name).toBeDefined();
expect(infos.location).toBeDefined();
expect(infos.locationUrl).toBeDefined();
expect(infos.pricing.length).toBeGreaterThan(0);
expect(infos.media.length).toBeGreaterThan(0);

});

});

test('if availableMovies is working', async () => {

return scrappingService.availableMovies().then((movies) => {
Expand Down

0 comments on commit aaee036

Please sign in to comment.