-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDirectoryCrawler.py
96 lines (65 loc) · 2.84 KB
/
DirectoryCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# coding=utf-8
"""
DirectoryCrawler.py
This module contains the DirectoryCrawler class, which reads the documents stored in a
test/training data directory and returns them as a list of documents.
@author David Greisler <s0531301@htw-berlin.de>
@author Paul Kitt <s0528516@htw-berlin.de>
"""
import os
from Document import Document
class DirectoryCrawler(object):
    """
    Crawls a directory of test/training data and returns the documents of a given class name.

    The directory crawler expects the following directory layout:
        <root path>/<class name>/test/   containing test documents.
        <root path>/<class name>/train/  containing training documents.

    Every entry listed in one of these directories is turned into a ``Document`` that is
    constructed from the full path of the entry.
    """

    # Names of the sub directories that hold the two document sets of a class.
    _TEST_DIRECTORY = "test"
    _TRAINING_DIRECTORY = "train"

    # The path to the directory containing the test/training documents. Contains trailing slash.
    _root_path = ""

    def __init__(self, root_path):
        """
        Creates a new directory crawler using the given root path.

        Args:
            root_path: The path to the directory containing one sub directory per class name.
        """
        # Go through the property setter so the normalisation lives in exactly one place.
        self.root_path = root_path

    @property
    def root_path(self):
        """
        Returns the normalised root path of the directory crawler, ending with a path separator.
        """
        return self._root_path

    @root_path.setter
    def root_path(self, root_path):
        """
        Sets the root path of the directory crawler.

        The path is normalised with ``os.path.normpath`` and stored with exactly one trailing
        path separator.

        Args:
            root_path: The new root path.
        """
        normalized = os.path.normpath(root_path)
        # normpath() keeps the separator of a bare file system root ("/" or "C:\\"). Appending a
        # second one would produce a doubled separator, which Windows reads as a UNC prefix.
        if not normalized.endswith(os.sep):
            normalized += os.sep
        self._root_path = normalized

    def read_test_documents(self, document_class):
        """
        Reads all test documents for the given class name and returns them as list of documents.

        Args:
            document_class: The class name, i.e. the name of the directory below the root path.

        Returns:
            A list containing all test documents for the given class name, ordered by file name.

        Raises:
            OSError: If the test directory of the class cannot be listed.
        """
        return self._read_documents(document_class, self._TEST_DIRECTORY)

    def read_training_documents(self, document_class):
        """
        Reads all training documents for the given class name and returns them as list of
        documents.

        Args:
            document_class: The class name, i.e. the name of the directory below the root path.

        Returns:
            A list containing all training documents for the given class name, ordered by file
            name.

        Raises:
            OSError: If the training directory of the class cannot be listed.
        """
        return self._read_documents(document_class, self._TRAINING_DIRECTORY)

    def _read_documents(self, document_class, subset):
        """
        Builds one document per entry of <root path>/<document_class>/<subset>/.

        Args:
            document_class: The class name, i.e. the name of the directory below the root path.
            subset: The name of the sub directory to read ("test" or "train").

        Returns:
            A list of documents, ordered by the sorted names of the directory entries.
        """
        directory = os.path.join(self._root_path, document_class, subset)
        # Sorting makes the document order independent of the order os.listdir() reports.
        return [Document(os.path.join(directory, name)) for name in sorted(os.listdir(directory))]