-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathUrl_Features.py
97 lines (66 loc) · 1.93 KB
/
Url_Features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# This file contains all the URL feature-extraction functions.
# Each function extracts one particular feature from a URL string.
import re
from urllib.parse import urlparse
# First Directory Length
def fd_length(url):
    """Return the length of the first directory segment of the URL's path.

    The path component produced by ``urlparse`` is split on ``/`` and the
    length of the element at index 1 (the text between the first and the
    second slash) is returned.

    Args:
        url: URL string to inspect.

    Returns:
        Length of the first path segment, or 0 when the path contains no
        ``/`` (so there is no first directory) or when the path cannot be
        split with a ``str`` separator.
    """
    urlpath = urlparse(url).path
    try:
        return len(urlpath.split('/')[1])
    except (IndexError, TypeError):
        # IndexError: the path has no "/" and therefore no first directory.
        # TypeError: bytes/None input makes urlparse return a bytes path,
        # which cannot be split on a str; keep the historical fallback of 0
        # without also swallowing KeyboardInterrupt/SystemExit.
        return 0
def digit_count(url):
    """Count the characters of ``url`` for which ``str.isnumeric()`` is true.

    Args:
        url: URL string to scan character by character.

    Returns:
        Number of numeric characters found (0 for an empty string).
    """
    return sum(1 for ch in url if ch.isnumeric())
def letter_count(url):
    """Count the characters of ``url`` for which ``str.isalpha()`` is true.

    Args:
        url: URL string to scan character by character.

    Returns:
        Number of alphabetic characters found (0 for an empty string).
    """
    alphabetic = [ch for ch in url if ch.isalpha()]
    return len(alphabetic)
def no_of_dir(url):
    """Return how many ``/`` characters occur in the path component of ``url``.

    Args:
        url: URL string to inspect.

    Returns:
        Count of ``/`` in ``urlparse(url).path``.
    """
    parsed = urlparse(url)
    return parsed.path.count('/')
# Use of IP or not in domain
# Compiled once at import time; the three alternatives are joined with "|"
# so that each address form is detected independently of the others.
_IP_ADDRESS_RE = re.compile(
    # IPv4 in dotted decimal, followed by "/"
    r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
    r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])/)|'
    # IPv4 in hexadecimal (0x..), followed by "/"
    r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.'
    r'(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})/)|'
    # IPv6 in full (eight-group) notation
    r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}'
)


def having_ip_address(url):
    """Report whether ``url`` contains a literal IP address.

    The whole URL string is searched (not only the host part) for a dotted
    decimal IPv4 address followed by ``/``, a hexadecimal IPv4 address
    followed by ``/``, or a full eight-group IPv6 address.

    Args:
        url: URL string to search.

    Returns:
        -1 when an IP address pattern is found, 1 otherwise.
    """
    if _IP_ADDRESS_RE.search(url):
        return -1
    return 1
def hostname_length(url):
    """Return the length of the network-location (``netloc``) part of ``url``.

    Args:
        url: URL string to inspect.

    Returns:
        Number of characters in ``urlparse(url).netloc``; this includes any
        port or credentials present in that component.
    """
    netloc = urlparse(url).netloc
    return len(netloc)
def url_length(url):
    """Return the length of the path component of ``url``.

    NOTE(review): despite the name, only ``urlparse(url).path`` is measured,
    not the full URL string -- confirm this is what consumers expect.

    Args:
        url: URL string to inspect.

    Returns:
        Number of characters in the parsed path.
    """
    path = urlparse(url).path
    return len(path)
# Gets all count features
def get_counts(url):
    """Count occurrences of a fixed set of substrings in ``url``.

    Args:
        url: URL string to scan.

    Returns:
        A list of nine integers: the number of non-overlapping occurrences
        of ``-``, ``@``, ``?``, ``%``, ``.``, ``=``, ``http``, ``https`` and
        ``www`` in that order. Every ``https`` is also counted as ``http``
        because the shorter token is a prefix of the longer one.
    """
    tokens = ('-', '@', '?', '%', '.', '=', 'http', 'https', 'www')
    return [url.count(token) for token in tokens]