Commit d6b65d3 1 parent 7ea888e commit d6b65d3 Copy full SHA for d6b65d3
File tree 3 files changed +86
-0
lines changed
3 files changed +86
-0
lines changed Original file line number Diff line number Diff line change
1
+ {
2
+ "name" : " GetHTMLPiece" ,
3
+ "description" : " GetHTMLPiece" ,
4
+ "dependency" : {
5
+ "dockerfile" : " Dockerfile_1"
6
+ },
7
+ "container_resources" : {
8
+ "requests" : {
9
+ "cpu" : 100 ,
10
+ "memory" : 128
11
+ },
12
+ "limits" : {
13
+ "cpu" : 1000 ,
14
+ "memory" : 1024
15
+ }
16
+ },
17
+ "tags" : [
18
+ " browser" ,
19
+ " chrome" ,
20
+ " web" ,
21
+ " selenium"
22
+ ],
23
+ "style" : {
24
+ "node_label" : " Get HTML Piece" ,
25
+ "icon_class_name" : " skill-icons:selenium"
26
+ }
27
+ }
Original file line number Diff line number Diff line change
1
+ from pydantic import BaseModel , Field
2
+ from typing import List
3
+
4
class InputModel(BaseModel):
    """Selenium Web Browser Input Model"""

    # List of page URLs to load; when omitted, a single default URL is used.
    get_page_html: List[str] = Field(
        description="URL you want to extract HTML",
        default=["http://www.google.com.br"],
    )
13
+
14
class OutputModel(BaseModel):
    """Selenium Web Browser Output Model"""

    # Required field: where the pickled list of page sources was written.
    output_file_path: str = Field(
        description="Path for pickle file with a list of all HTML files combined.",
    )
Original file line number Diff line number Diff line change
1
+ from domino .base_piece import BasePiece
2
+ from .models import InputModel , OutputModel
3
+ from selenium import webdriver
4
+ from selenium .webdriver .chrome .options import Options
5
+ from pathlib import Path
6
+ from typing import List
7
+ import uuid
8
+ import pickle
9
+
10
class GetHTMLPiece(BasePiece):
    """Piece that loads a list of URLs in headless Chrome and pickles their HTML."""

    def piece_function(self, input_data: InputModel) -> OutputModel:
        """Fetch the page source of every URL in ``input_data.get_page_html``.

        Args:
            input_data: Input model whose ``get_page_html`` field lists the
                URLs to load, in order.

        Returns:
            An ``OutputModel`` whose ``output_file_path`` is the path of a
            pickle file (created under ``self.results_path``) containing a
            list with one HTML string per input URL, in input order.

        Raises:
            Any exception raised by the web driver while loading a page is
            propagated; the browser is shut down before it propagates.
        """
        results_path = Path(self.results_path)

        self.logger.info("Create chrome options arguments.")
        options = Options()
        options.add_argument("--headless=new")
        # NOTE(review): presumably required because the piece runs inside a
        # container where Chrome's sandbox is unavailable -- confirm.
        options.add_argument("--no-sandbox")

        self.logger.info("Start chrome web driver.")
        driver = webdriver.Chrome(options=options)

        outputs: List[str] = []
        try:
            for url in input_data.get_page_html:
                driver.get(url)
                outputs.append(driver.page_source)
        finally:
            # Always terminate the browser, even when a page load fails, so a
            # failed run does not leave a Chrome process behind.
            driver.quit()

        # A random UUID keeps result files from different runs from colliding.
        file_name = str(results_path / f"{uuid.uuid4()}.pkl")
        with open(file_name, "wb") as file:
            pickle.dump(outputs, file)

        return OutputModel(output_file_path=file_name)
You can’t perform that action at this time.
0 commit comments