Skip to content

Commit d6b65d3

Browse files
committed
new piece
1 parent 7ea888e commit d6b65d3

File tree

3 files changed

+86
-0
lines changed

3 files changed

+86
-0
lines changed

pieces/GetHTMLPiece/metadata.json

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"name": "GetHTMLPiece",
3+
"description": "GetHTMLPiece",
4+
"dependency": {
5+
"dockerfile": "Dockerfile_1"
6+
},
7+
"container_resources": {
8+
"requests": {
9+
"cpu": 100,
10+
"memory": 128
11+
},
12+
"limits": {
13+
"cpu": 1000,
14+
"memory": 1024
15+
}
16+
},
17+
"tags": [
18+
"browser",
19+
"chrome",
20+
"web",
21+
"selenium"
22+
],
23+
"style": {
24+
"node_label": "Get HTML Piece",
25+
"icon_class_name": "skill-icons:selenium"
26+
}
27+
}

pieces/GetHTMLPiece/models.py

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from pydantic import BaseModel, Field
2+
from typing import List
3+
4+
class InputModel(BaseModel):
    """
    Selenium Web Browser Input Model
    """

    # URLs to visit; the piece collects one HTML string per entry.
    get_page_html: List[str] = Field(
        description="URL you want to extract HTML",
        default=["http://www.google.com.br"],
    )
13+
14+
class OutputModel(BaseModel):
    """
    Selenium Web Browser Output Model
    """

    # Required field (no default): location of the pickle produced by the piece.
    output_file_path: str = Field(
        description=(
            "Path for pickle file with a list of all HTML files combined."
        ),
    )

pieces/GetHTMLPiece/piece.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from domino.base_piece import BasePiece
2+
from .models import InputModel, OutputModel
3+
from selenium import webdriver
4+
from selenium.webdriver.chrome.options import Options
5+
from pathlib import Path
6+
from typing import List
7+
import uuid
8+
import pickle
9+
10+
class GetHTMLPiece(BasePiece):
    """Piece that loads a list of URLs in headless Chrome and pickles their HTML."""

    def piece_function(self, input_data: InputModel) -> OutputModel:
        """Fetch the page source of every requested URL and store them in one pickle.

        Args:
            input_data: Input model; ``get_page_html`` is the list of URLs to load.

        Returns:
            An ``OutputModel`` whose ``output_file_path`` is the path of a
            ``.pkl`` file written under ``self.results_path``. The file holds a
            list of HTML strings, one per URL, in the same order as the input.

        Raises:
            Any exception raised by the web driver while loading a page is
            propagated to the caller; the browser is shut down first.
        """
        results_path = Path(self.results_path)
        outputs: List[str] = []

        self.logger.info("Create chrome options arguments.")

        options = Options()
        options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")

        self.logger.info("Start chrome web driver.")
        driver = webdriver.Chrome(options=options)

        # Always quit the driver, even when a page fails to load; otherwise
        # the Chrome process started above would be left running.
        try:
            for url in input_data.get_page_html:
                driver.get(url)
                outputs.append(driver.page_source)
        finally:
            driver.quit()

        # A random file name avoids clashes between runs sharing a results dir.
        file_name = f"{results_path}/{uuid.uuid4()}.pkl"

        with open(file_name, "wb") as file:
            pickle.dump(outputs, file)

        return OutputModel(output_file_path=file_name)

0 commit comments

Comments
 (0)