-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconf.pl
executable file
·313 lines (259 loc) · 12.9 KB
/
conf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# Perlfect Search configuration file
#$rcs = ' $Id: conf.pl,v 1.74 2007/03/30 22:55:03 gzervas Exp $' ;
# NOTE: Whenever you change one of the options that's marked with [re-index]
# you need to run indexer.pl again to make the change take effect.
###########################################################################
### Basic configuration
### Adjust these values by hand if setup.pl was not used.

# Directory on disk where the indexer starts. [re-index]
# ** Note **: if your files are generated dynamically (e.g. via PHP), you
# should set $HTTP_START_URL (see below); otherwise the "highlight matches"
# link lets users see the source code of your pages.
$DOCUMENT_ROOT = "/home/perlfect/perlfect.com/html/";

# Base URL of the site (normally the URL that corresponds to
# $DOCUMENT_ROOT).
$BASE_URL = "http://localhost";

# URL in which Perlfect Search is located (usually somewhere in cgi-bin/).
$CGIBIN = '/cgi-bin/search/';

# Full path of the directory where Perlfect Search is installed.
$INSTALL_DIR = "/home/perlfect/perlfect.com/cgi-bin/search/";

# File extensions that get indexed (case-sensitive). Relevant for file
# system indexing only; when indexing via http, set @HTTP_CONTENT_TYPES
# instead. [re-index]
@EXT = qw(html htm shtml);

# Password for running indexer.pl via CGI, which is needed when you have no
# telnet/ssh access to the server that runs the script.
# ** Note **: use this only if absolutely necessary! The empty string
# disables execution as a CGI, which is much more secure. Other people on
# your server can probably read this file and look up your password.
$INDEXER_CGI_PASSWORD = '';
###########################################################################
### HTTP configuration
### Only needed if you want to index your pages via http.

# URL where the indexer starts when indexing via http. Leave empty to index
# the files in the filesystem ($DOCUMENT_ROOT). [re-index]
# ** WARNING **: do not use this for foreign servers! It might use too many
# resources on other people's servers.
# Example: $HTTP_START_URL = 'http://localhost/';
$HTTP_START_URL = "";

# Maximum number of pages that will be visited (this can be bigger than the
# number of pages indexed). The indexer might not notice that it runs into
# an endless loop; this limit avoids that. [re-index]
$HTTP_MAX_PAGES = 100;

# Document root of the web server. Normally the same as $DOCUMENT_ROOT; it
# differs if Perlfect Search is only used on a subdirectory. [re-index]
$HTTP_SERVER_ROOT = $DOCUMENT_ROOT;

# URL patterns that crawling is limited to. This is an important setting:
# it keeps the script from running out of control. [re-index]
# ** WARNING **: the default ($HTTP_START_URL) should not be changed,
# otherwise the script may crawl remote servers. For example, the
# robots.txt file will only be used on the $HTTP_START_URL server!
@HTTP_LIMIT_URLS = ($HTTP_START_URL);

# Comment this line out to ignore robots.txt (only do that if you really
# know what you are doing):
$ROBOT_AGENT = "perlfectsearch";

# Should the indexer follow links that are commented out?
$HTTP_FOLLOW_COMMENT_LINKS = 1;

# Content types to index (http indexing only). Add 'application/msword'
# for MS-Word and 'application/pdf' for PDF. [re-index]
@HTTP_CONTENT_TYPES = qw(text/html text/plain);

# Set to 1 to get verbose output during indexing. [re-index]
$HTTP_DEBUG = 1;
###########################################################################
### Advanced configuration
### Only needed if you want to adapt advanced features.

# Programs that convert other formats to ASCII text. [re-index]
# The name of the file to be filtered is passed as FILENAME, and the
# command must print out ASCII (or latin1) text.
#   pdftotext is part of xpdf: http://www.foolabs.com/xpdf/download.html
#   antiword is available at:  http://www.winfield.demon.nl/
# NOTE: @EXT or @HTTP_CONTENT_TYPES have to be set accordingly, too.
# If there is a problem with pdftotext, try a new version or pass the
# -raw option to pdftotext.
%EXT_FILTER = (
    pdf => '/usr/bin/pdftotext FILENAME -enc UTF-8 -',
    doc => '/usr/bin/antiword FILENAME',
);

# Number of results shown per page.
$RESULTS_PER_PAGE = 10;

# Upper limit for the number of results. 0 = no limit.
$MAX_RESULTS = 0;

# Enables the "highlight matches" feature, which displays the original
# pages with the search terms highlighted. See the README for the
# restrictions of this feature.
$HIGHLIGHT_MATCHES = 1;

# A "highlight matches" link only works for HTML files, so such a link is
# offered only for files with these suffixes.
# ** Note **: if $HTTP_START_URL is not set, highlighting loads the file
# from disk, so the user might find passwords in the highlighted file!
# Do not include dynamic files here unless you are using $HTTP_START_URL.
@HIGHLIGHT_EXT = qw(html htm);

# Background colors for the highlighted search terms in the matching
# document (the browser must support CSS for this). When the last color
# has been used and terms are still left, the first one is used again.
@HIGHLIGHT_COLORS = ("#4fafea", "#e5b547", "#aaaaaa", "#ee77ee");

# Show the ranking in percent, with the first document = 100%.
$PERCENTAGE_RANKING = 1;
# Set to 1 to index numbers as well. [re-index]
$INDEX_NUMBERS = 0;

# Set to 1 if you do not have enough memory. This slows down indexer.pl by
# a factor of about 2. Searching is not affected.
$LOW_MEMORY_INDEX = 1;

# How much of each document is put into the index. With this option the
# context of the match is shown on the results page, but only if the match
# was in the first $CONTEXT_SIZE bytes of the document.
# Warning: using this option generates a very big index file.
# 0 disables the option, -1 means no limit. [re-index]
$CONTEXT_SIZE = 0;

# If $CONTEXT_SIZE is enabled: how many occurrences of every term are shown
# on the results page.
$CONTEXT_EXAMPLES = 2;

# If $CONTEXT_SIZE is enabled: how many words are used to show the context
# of a term.
$CONTEXT_DESC_WORDS = 12;

# Number of words taken from the <BODY> of an HTML document as its
# description when no <META description> tag is available and
# $CONTEXT_SIZE is 0. [re-index]
$DESC_WORDS = 25;

# Minimum length of a word; any shorter word is not indexed. [re-index]
$MINLENGTH = 3;

# UTF-8 is now used for searching and displaying results, so this value
# should always be ZERO!
$SPECIAL_CHARACTERS = 0;

# The largest acceptable word size. Reducing this saves space but
# decreases result accuracy. Setting the variable to 0 ignores stemming
# altogether. [re-index]
$STEMCHARS = 0;

# Set to 1 to add URLs to the index so that one can search for them.
# Special characters are ignored, just as in normal text. [re-index]
$INDEX_URLS = 0;

# Parts of your documents enclosed in these HTML comments are ignored
# completely. [re-index]
$IGNORE_TEXT_START = "<!--ignore_perlfect_search-->";
$IGNORE_TEXT_END   = "<!--/ignore_perlfect_search-->";

# Maximum length of <title> elements; everything longer is cut off.
# [re-index]
$MAX_TITLE_LENGTH = 80;

# Ranking weights: how much more important words in the title, in the meta
# values (author, description, keywords) and in the headlines are compared
# to normal body text. Any integer may be used (0 = ignore that text
# completely). [re-index]
$TITLE_WEIGHT = 5;
$META_WEIGHT  = 5;
# Headline weights, keyed by level: <h1> ... <h6>.
@H_WEIGHT{ 1 .. 6 } = (5, 4, 3, 1, 1, 1);
# Set to 1 to log the queries to an extra file. Every use of search.pl is
# then logged to data/log.txt. That file has to exist and must be writable
# for the webserver. The line format is:
# REMOTE_HOST;date;terms;matches;current page;(time to search in seconds);
# NOTE: to get the time value you have to comment in two lines at the top
# of search.pl (see the comment there).
# NOTE: with many queries this file grows quite fast.
$LOG = 0;

# Factor that multiplies the score of results containing more than one of
# the searched terms (applied several times if there are more than two
# terms). Queries with only one term are not affected. 0 turns it off.
$MULTIPLE_MATCH_BOOST = 0;

# Date format for the result page. %Y = year, %m = month, %d = day,
# %H = hour, %M = minute, %S = second. On a Unix system, 'man strftime'
# lists all possible options.
$DATE_FORMAT = '%Y-%m-%d';

# Date format for the "Latest Index update" information on the result page.
$INDEX_DATE_FORMAT = '%Y-%m-%d %H:%M';
# Directory with templates (normally you don't have to modify this).
$TEMPLATE_DIR = "${INSTALL_DIR}templates/";

# Default language: used if no lang parameter is passed to the script or
# if the parameter is invalid.
$DEFAULT_LANG = "en";

# Result templates and "no match" templates for several languages.
@SEARCH_TEMPLATE{qw(en de fr it)} = map { $TEMPLATE_DIR . $_ }
    qw(search.html search_de.html search_fr.html search_it.html);
@NO_MATCH_TEMPLATE{qw(en de fr it)} = map { $TEMPLATE_DIR . $_ }
    qw(no_match.html no_match_de.html no_match_fr.html no_match_it.html);

# Templates for using search.pl via command line:
$SEARCH_TEMPLATE{text}   = "${TEMPLATE_DIR}search.txt";
$NO_MATCH_TEMPLATE{text} = "${TEMPLATE_DIR}no_match.txt";

# Templates for the test cases (development only):
$SEARCH_TEMPLATE{qa}   = "${INSTALL_DIR}qa/search_qa.txt";
$NO_MATCH_TEMPLATE{qa} = "${INSTALL_DIR}qa/no_match_qa.txt";
# Text of the "Next Page" link in several languages.
@NEXT_PAGE{qw(en de fr it)} = ('Next', 'nächste Seite', 'Suivant', 'Successiva');

# Text of the "Previous Page" link in several languages.
@PREV_PAGE{qw(en de fr it)} = ('Previous', 'vorige Seite', 'Précédent', 'Precedente');

# Text of the link that shows a colored background for matched terms.
# NOTE(review): only 'en' and 'de' are defined here, unlike the other text
# tables -- confirm how search.pl handles the remaining languages.
@HIGHLIGHT_TERMS{qw(en de)} = ('highlight matches', 'Treffer hervorheben');

# Text of the "too common" warning. <WORDS> will be replaced with a list
# of the ignored words. If there are no ignored words, this text does not
# appear. (Each string contains a line break.)
$IGNORED_WORDS{en} = q{<p>The following words are either too short or very common and were
not included in your search: <strong><WORDS></strong></p>};
$IGNORED_WORDS{de} = q{<p>Folgende Wörter sind zu kurz oder kommen sehr häufig vor und wurden
daher in Ihrer Suchanfrage ignoriert: <strong><WORDS></strong></p>};
$IGNORED_WORDS{fr} = q{<p>Les mots suivants sont trop courts ou très courants et n'ont
pas été inclus dans votre recherche: <strong><WORDS></strong></p>};
# Italian text of the "too short or very common" notice. <WORDS> is the
# placeholder that gets replaced with the list of ignored words. The
# "too short" reason ("troppo brevi") is now stated, matching the English,
# German and French texts.
$IGNORED_WORDS{'it'} = '<p>Le seguenti parole sono troppo brevi o molto comuni e non
saranno incluse nella vostra ricerca: <strong><WORDS></strong></p>';
###########################################################################
### Nothing below this line should need editing.

# Various paths (do NOT use the system-wide /tmp, for security reasons!)
$TMP_DIR        = "${INSTALL_DIR}temp/";
$DATA_DIR       = "${INSTALL_DIR}data/";
$CONF_DIR       = "${INSTALL_DIR}conf/";
$STOPWORDS_FILE = "${CONF_DIR}stopwords.txt";
$NO_INDEX_FILE  = "${CONF_DIR}no_index.txt";
$LOGFILE        = "${DATA_DIR}log.txt";
$SEARCH         = "search.pl";
$SEARCH_URL     = "$CGIBIN$SEARCH";
$UPDATE_FILE    = "${DATA_DIR}update";

# Paths to the database files.
$INV_INDEX_DB_FILE = "${DATA_DIR}inv_index";
$DOCS_DB_FILE      = "${DATA_DIR}docs";
$URLS_DB_FILE      = "${DATA_DIR}urls";
$SIZES_DB_FILE     = "${DATA_DIR}sizes";
$TERMS_DB_FILE     = "${DATA_DIR}terms";
$DF_DB_FILE        = "${DATA_DIR}df";
$TF_DB_FILE        = "${DATA_DIR}tf";
$CONTENT_DB_FILE   = "${DATA_DIR}content";
$DESC_DB_FILE      = "${DATA_DIR}desc";
$TITLES_DB_FILE    = "${DATA_DIR}titles";
$DATES_DB_FILE     = "${DATA_DIR}dates";

# Paths to the temporary database files.
$INV_INDEX_TMP_DB_FILE = "${DATA_DIR}inv_index_tmp";
$DOCS_TMP_DB_FILE      = "${DATA_DIR}docs_tmp";
$URLS_TMP_DB_FILE      = "${DATA_DIR}urls_tmp";
$SIZES_TMP_DB_FILE     = "${DATA_DIR}sizes_tmp";
$TERMS_TMP_DB_FILE     = "${DATA_DIR}terms_tmp";
$CONTENT_TMP_DB_FILE   = "${DATA_DIR}content_tmp";
$DESC_TMP_DB_FILE      = "${DATA_DIR}desc_tmp";
$TITLES_TMP_DB_FILE    = "${DATA_DIR}titles_tmp";
$DATES_TMP_DB_FILE     = "${DATA_DIR}dates_tmp";

# Official version number.
$VERSION = '3.37';

# A file loaded with require has to end in a true value.
1;