-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.h
155 lines (134 loc) · 4.78 KB
/
scrapper.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Reads each site's sitemap from Cassandra, crawls the listed pages, and scrapes e-commerce data from them (price, title, description, image, etc.). The scraped data is meant to be returned as JSON so it can be accessed later, probably through an API.
#ifndef SCRAPPER_H
#define SCRAPPER_H
#include <chrono>
#include <condition_variable>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include <nlohmann/json.hpp>
#include <boost/interprocess/shared_memory_object.hpp>
#include <boost/interprocess/mapped_region.hpp>
#include "db.h"
#include "WebPageFetcher.h"
using namespace std;
using json = nlohmann::json;
class Scrapper {
private:
// cassandra variables
mutex mtx;
condition_variable cv;
CassSession* session;
std::ifstream jsonFile;
json jsonUrls;
std::vector<std::string> siteUrls;
DB db;
WebPageFetcher wpf;
std::string webpage;
std::vector<std::string> sitemap;
private:
void get_urls_from_json() {
jsonFile.open("urls.json");
if (!jsonFile.is_open()) {
std::cerr << "Error opening JSON file" << std::endl;
return;
}
jsonFile >> jsonUrls;
siteUrls = jsonUrls;
}
void process_url(std::string siteUrl){
get_sitemap_for_url(siteUrl);
fetch_and_signal_python_scripts();
}
void get_sitemap_for_url(std::string siteUrl) {
std::vector<std::string> sitemapData;
sitemapData = db.getSitemapFromCassandra(session, siteUrl);
if (!sitemapData.empty()) {
sitemap = sitemapData;
}
}
void fetch_and_signal_python_scripts() {
for (auto siteMapUrl : sitemap) {
fetch_and_signal_python_script(siteMapUrl);
}
}
void fetch_and_signal_python_script(std::string siteMapUrl) {
webpage = wpf.fetch(siteMapUrl);
try{
//shared memory object
boost::interprocess::shared_memory_object shm(
boost::interprocess::open_or_create,
"webpage",
boost::interprocess::read_write
);
// set size
shm.truncate(webpage.size() + 1);
// map the whole shared memory in this process
boost::interprocess::mapped_region region(
shm,
boost::interprocess::read_write
);
// copy data to shared memory
std::memcpy(region.get_address(), webpage.c_str(), webpage.size() + 1);
//signal python script
signal_python_script();
} catch (boost::interprocess::interprocess_exception &ex) {
std::cout << "BOOST::EXCEPTION: " << ex.what() << std::endl;
}
}
public:
Scrapper() {
get_urls_from_json();
db.connect();
session = db.getSession();
}
~Scrapper() {
jsonFile.close();
db.close();
}
void run(){
std::cout << "Scrapper running..." << std::endl;
std::vector<std::thread> threads;
for (const auto& siteurl : siteUrls) {
threads.push_back(std::thread(&Scrapper::process_url, this, siteurl));
}
for (auto& thread : threads) {
thread.join();
}
}
// get site map from cassandra
void get_sitemap(){
session = db.getSession();
for (auto url : siteUrls) {
sitemap = db.getSitemapFromCassandra(session, url);
}
}
// signal python script to start processing the data using a named pipe
void signal_python_script() {
std::ofstream pipe;
pipe.open("pipe", std::ios::out);
pipe << "1";
pipe.close();
// Wait for the named pipe to be created
const std::string pipePath = "pipe";
const std::chrono::milliseconds waitTime(100);
while (!std::ifstream(pipePath)) {
std::this_thread::sleep_for(waitTime);
}
//run python script
const std::string pythonScript = "html_parser.py";
std::string command = "python3 " + pythonScript;
int status = std::system(command.c_str());
if (status == 0) {
std::cout << "Python script ran successfully" << std::endl;
} else {
std::cout << "Python script failed to run" << std::endl;
}
}
};
#endif