diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 745588daff..36f6b529d4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -27,8 +27,8 @@ store.ip.address false - Enables us to capture the specific IP address - (InetSocketAddress) of the host which we connect to via + Enables us to capture the specific IP address + (InetSocketAddress) of the host which we connect to via the given protocol. Currently supported is protocol-ftp and http. @@ -45,7 +45,7 @@ confuse this setting with the http.content.limit setting. - + file.crawl.parent true @@ -84,7 +84,7 @@ http.agent.name - HTTP 'User-Agent' request header. MUST NOT be empty - + HTTP 'User-Agent' request header. MUST NOT be empty - please set this to a single word uniquely related to your organization. NOTE: You should also check other related properties: @@ -104,23 +104,23 @@ http.robots.agents Any other agents, apart from 'http.agent.name', that the robots - parser would look for in robots.txt. Multiple agents can be provided using + parser would look for in robots.txt. Multiple agents can be provided using comma as a delimiter. eg. mybot,foo-spider,bar-crawler - - The ordering of agents does NOT matter and the robots parser would make - decision based on the agent which matches first to the robots rules. - Also, there is NO need to add a wildcard (ie. "*") to this string as the - robots parser would smartly take care of a no-match situation. - - If no value is specified, by default HTTP agent (ie. 'http.agent.name') - would be used for user agent matching by the robots parser. + + The ordering of agents does NOT matter and the robots parser would make + decision based on the agent which matches first to the robots rules. + Also, there is NO need to add a wildcard (ie. "*") to this string as the + robots parser would smartly take care of a no-match situation. + + If no value is specified, by default HTTP agent (ie. 'http.agent.name') + would be used for user agent matching by the robots parser. http.robot.rules.whitelist - Comma separated list of hostnames or IP addresses to ignore + Comma separated list of hostnames or IP addresses to ignore robot rules parsing for. Use with care and only if you are explicitly allowed by the site owner to ignore the site's robots.txt! @@ -146,7 +146,7 @@ http.agent.url - A URL to advertise in the User-Agent header. This will + A URL to advertise in the User-Agent header. This will appear in parenthesis after the agent name. Custom dictates that this should be a URL of a page explaining the purpose and behavior of this crawler. @@ -165,7 +165,7 @@ http.agent.version Nutch-1.14-SNAPSHOT - A version string to advertise in the User-Agent + A version string to advertise in the User-Agent header. @@ -280,7 +280,7 @@ http.proxy.exception.list - A comma separated list of URL's and hosts that don't use the proxy + A comma separated list of URL's and hosts that don't use the proxy (e.g. intranets). Example: www.apache.org @@ -338,8 +338,8 @@ http.store.responsetime true - Enables us to record the response time of the - host which is the time period between start connection to end + Enables us to record the response time of the + host which is the time period between start connection to end connection of a pages host. The response time in milliseconds is stored in CrawlDb in CrawlDatum's meta data under key "_rs_" @@ -380,7 +380,7 @@ ftp.content.limit - 65536 + 65536 The length limit for downloaded content, in bytes. 
If this value is nonnegative (>=0), content longer than it will be truncated; otherwise, no truncation at all. @@ -454,7 +454,7 @@ The implementation of fetch schedule. DefaultFetchSchedule simply adds the original fetchInterval to the last fetch time, regardless of page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt - to the rate at which a given page is changed. + to the rate at which a given page is changed. @@ -524,7 +524,7 @@ db.preserve.backup true If true, updatedb will keep a backup of the previous CrawlDB - version in the old directory. In case of disaster, one can rename old to + version in the old directory. In case of disaster, one can rename old to current and restore the CrawlDB to its previous state. @@ -562,8 +562,8 @@ db.update.max.inlinks 10000 - Maximum number of inlinks to take into account when updating - a URL score in the crawlDB. Only the best scoring inlinks are kept. + Maximum number of inlinks to take into account when updating + a URL score in the crawlDB. Only the best scoring inlinks are kept. @@ -673,7 +673,7 @@ db.parsemeta.to.crawldb Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). - Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' + Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' will copy both the key 'lang' and its value to the corresponding entry in the crawldb. @@ -754,7 +754,7 @@ generate.count.mode host Determines how the URLs are counted for generator.max.count. - Default value is 'host' but can be 'domain'. Note that we do not count + Default value is 'host' but can be 'domain'. Note that we do not count per IP in the new version of the Generator. @@ -813,8 +813,8 @@ partition.url.mode byHost - Determines how to partition URLs. Default value is 'byHost', - also takes 'byDomain' or 'byIP'. + Determines how to partition URLs. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. @@ -822,9 +822,9 @@ crawl.gen.delay 604800000 - This value, expressed in milliseconds, defines how long we should keep the lock on records - in CrawlDb that were just selected for fetching. If these records are not updated - in the meantime, the lock is canceled, i.e. they become eligible for selecting. + This value, expressed in milliseconds, defines how long we should keep the lock on records + in CrawlDb that were just selected for fetching. If these records are not updated + in the meantime, the lock is canceled, i.e. they become eligible for selecting. Default value of this is 7 days (604800000 ms). @@ -834,9 +834,9 @@ fetcher.server.delay 5.0 - The number of seconds the fetcher will delay between + The number of seconds the fetcher will delay between successive requests to the same server. Note that this might get - overridden by a Crawl-Delay from a robots.txt and is used ONLY if + overridden by a Crawl-Delay from a robots.txt and is used ONLY if fetcher.threads.per.queue is set to 1. @@ -844,7 +844,7 @@ fetcher.server.min.delay 0.0 - The minimum number of seconds the fetcher will delay between + The minimum number of seconds the fetcher will delay between successive requests to the same server. This value is applicable ONLY if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking is turned off). @@ -860,7 +860,7 @@ amount of time retrieved from robots.txt Crawl-Delay, however long that might be. 
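To make the interplay of these delay settings concrete, a polite one-thread-per-host crawl could be sketched in nutch-site.xml as below (the values are illustrative assumptions, not recommended defaults):

  <property>
    <name>fetcher.server.delay</name>
    <!-- wait 5 seconds between successive requests to the same server;
         a robots.txt Crawl-Delay can override this value -->
    <value>5.0</value>
  </property>
  <property>
    <name>fetcher.threads.per.queue</name>
    <!-- keep at 1 so that fetcher.server.delay and the robots.txt
         Crawl-Delay are honoured instead of fetcher.server.min.delay -->
    <value>1</value>
  </property>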
- + fetcher.threads.fetch @@ -877,10 +877,10 @@ fetcher.threads.per.queue 1 This number is the maximum number of threads that - should be allowed to access a queue at one time. Setting it to + should be allowed to access a queue at one time. Setting it to a value > 1 will cause the Crawl-Delay value from robots.txt to be ignored and the value of fetcher.server.min.delay to be used - as a delay between successive requests to the same server instead + as a delay between successive requests to the same server instead of fetcher.server.delay. @@ -888,8 +888,8 @@ fetcher.queue.mode byHost - Determines how to put URLs into queues. Default value is 'byHost', - also takes 'byDomain' or 'byIP'. + Determines how to put URLs into queues. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. @@ -916,7 +916,7 @@ fetcher.timelimit.mins -1 This is the number of minutes allocated to the fetching. - Once this value is reached, any remaining entry from the input URL list is skipped + Once this value is reached, any remaining entry from the input URL list is skipped and all active queues are emptied. The default value of -1 deactivates the time limit. @@ -1008,7 +1008,7 @@ fetcher.follow.outlinks.ignore.external - true + true Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks in the output but not follow them. If db.ignore.external.links is true this directive is ignored. @@ -1016,22 +1016,22 @@ fetcher.bandwidth.target - -1 - Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of + -1 + Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of fetching threads automatically (up to fetcher.maxNum.threads). A value of -1 deactivates the functionality, in which case the number of fetching threads is fixed (see fetcher.threads.fetch). fetcher.maxNum.threads - 25 + 25 Max number of fetch threads allowed when using fetcher.bandwidth.target. Defaults to fetcher.threads.fetch if unspecified or set to a value lower than it. fetcher.bandwidth.target.check.everyNSecs - 30 + 30 (EXPERT) Value in seconds which determines how frequently we should reassess the optimal number of fetch threads when using fetcher.bandwidth.target. Defaults to 30 and must be at least 1. @@ -1052,7 +1052,7 @@ false Set this value to true if you want to use an implementation of the Publisher/Subscriber model. Make sure to set corresponding Publisher implementation specific properties - + @@ -1098,7 +1098,7 @@ in given order. For example, if this property has value: org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter then BasicIndexingFilter is applied first, and MoreIndexingFilter second. - + Filter ordering might have impact on result if one filter depends on output of another filter. @@ -1191,7 +1191,7 @@ mime.types.file tika-mimetypes.xml Name of file in CLASSPATH containing filename extension and - magic sequence to mime types mapping information. Overrides the default Tika config + magic sequence to mime types mapping information. Overrides the default Tika config if specified. @@ -1230,8 +1230,8 @@ include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By default Nutch includes crawling just HTML and plain text via HTTP, - and basic indexing and search plugins. 
In order to use HTTPS please enable - protocol-httpclient, but be aware of possible intermittent problems with the + and basic indexing and search plugins. In order to use HTTPS please enable + protocol-httpclient, but be aware of possible intermittent problems with the underlying commons-httpclient library. Set parsefilter-naivebayes for classification based focused crawler. @@ -1239,7 +1239,7 @@ plugin.excludes - Regular expression naming plugin directory names to exclude. + Regular expression naming plugin directory names to exclude. @@ -1252,7 +1252,7 @@ custom tags here will allow for their propagation into a pages outlinks, as well as allow for them to be included as part of an index. Values should be comma-delimited. ("tag1,tag2,tag3") Do not pad the tags with - white-space at their boundaries, if you are using anything earlier than Hadoop-0.21. + white-space at their boundaries, if you are using anything earlier than Hadoop-0.21. @@ -1310,8 +1310,8 @@ parser.html.outlinks.ignore_tags - Comma separated list of HTML tags, from which outlinks - shouldn't be extracted. Nutch takes links from: a, area, form, frame, + Comma separated list of HTML tags, from which outlinks + shouldn't be extracted. Nutch takes links from: a, area, form, frame, iframe, script, link, img. If you add any of those tags here, it won't be taken. Default is empty list. Probably reasonable value for most people would be "img,script,link". @@ -1340,7 +1340,7 @@ parsefilter.naivebayes.trainfile naivebayes-train.txt - Set the name of the file to be used for Naive Bayes training. The format will be: + Set the name of the file to be used for Naive Bayes training. The format will be: Each line contains two tab separated parts There are two columns/parts: 1. "1" or "0", "1" for relevant and "0" for irrelevant documents. @@ -1354,7 +1354,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parsefilter.naivebayes.wordlist naivebayes-wordlist.txt - Put the name of the file you want to be used as a list of + Put the name of the file you want to be used as a list of important words to be matched in the url for the model filter. The format should be one word per line. @@ -1362,8 +1362,8 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parser.timeout 30 - Timeout in seconds for the parsing of a document, otherwise treats it as an exception and - moves on the the following documents. This parameter is applied to any Parser implementation. + Timeout in seconds for the parsing of a document, otherwise treats it as an exception and + moves on the the following documents. This parameter is applied to any Parser implementation. Set to -1 to deactivate, bearing in mind that this could cause the parsing to crash because of a very long or corrupted document. @@ -1384,8 +1384,8 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parser.skip.truncated true - Boolean value for whether we should skip parsing for truncated documents. By default this - property is activated due to extremely high levels of CPU which parsing can sometimes take. + Boolean value for whether we should skip parsing for truncated documents. By default this + property is activated due to extremely high levels of CPU which parsing can sometimes take. @@ -1422,10 +1422,10 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this - + tika.extractor.boilerpipe.algorithm ArticleExtractor - + Which Boilerpipe algorithm to use. 
Valid values are: DefaultExtractor, ArticleExtractor or CanolaExtractor. @@ -1507,7 +1507,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this Max depth value from seed allowed by default. Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE" as a seed metadata. This plugin adds a "_depth_" metadatum to the pages - to track the distance from the seed it was found from. + to track the distance from the seed it was found from. The depth is used to prioritise URLs in the generation step so that shallower pages are fetched first. @@ -1515,15 +1515,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this scoring.similarity.model cosine The type of similarity metric to use. Eg - cosine (which is, currently, the only available model). - Please make sure to set the model specific properties for the scoring to function properly. + Please make sure to set the model specific properties for the scoring to function properly. Description of these properties can be found on the wiki. @@ -1539,7 +1539,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> cosine.goldstandard.file goldstandard.txt - Path to the gold standard file which contains all the relevant text and terms, + Path to the gold standard file which contains all the relevant text and terms, pertaining to the domain. @@ -1547,7 +1547,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> scoring.similarity.stopword.file stopwords.txt - Name of the stopword text file. The user can specify a custom list of stop words + Name of the stopword text file. The user can specify a custom list of stop words in a text file. Each new stopword should be on a new line. @@ -1614,11 +1614,11 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.static - Used by plugin index-static to adds fields with static data at indexing time. + Used by plugin index-static to adds fields with static data at indexing time. You can specify a comma-separated list of fieldname:fieldcontent per Nutch job. Each fieldcontent can have multiple values separated by space, e.g., field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ... - It can be useful when collections can't be created by URL patterns, + It can be useful when collections can't be created by URL patterns, like in subcollection, but on a job-basis. @@ -1659,7 +1659,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Comma-separated list of keys to be taken from the parse metadata to generate fields. Can be used e.g. for 'description' or 'keywords' provided that these values are generated - by a parser (see parse-metatags plugin) + by a parser (see parse-metatags plugin) @@ -1667,7 +1667,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.content.md - Comma-separated list of keys to be taken from the content metadata to generate fields. + Comma-separated list of keys to be taken from the content metadata to generate fields. @@ -1676,7 +1676,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Comma-separated list of keys to be taken from the crawldb metadata to generate fields. - Can be used to index values propagated from the seeds with the plugin urlmeta + Can be used to index values propagated from the seeds with the plugin urlmeta @@ -1686,9 +1686,9 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> insightsService A string representing the information source to be used for GeoIP information - association. 
Either enter 'cityDatabase', 'connectionTypeDatabase', - 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the - Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, + association. Either enter 'cityDatabase', 'connectionTypeDatabase', + 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the + Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and available at runtime. @@ -1733,8 +1733,8 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> description,keywords Names of the metatags to extract, separated by ','. Use '*' to extract all metatags. Prefixes the names with 'metatag.' - in the parse-metadata. For instance to index description and keywords, - you need to activate the plugin index-metadata and set the value of the + in the parse-metadata. For instance to index description and keywords, + you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. @@ -1786,7 +1786,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> link.ignore.limit.domain true Limit to only a single outlink to the same domain. - + link.analyze.num.iterations @@ -1812,7 +1812,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Whether to delete gone pages from the web graph. - + link.loops.depth 2 The depth for the loops algorithm. @@ -1827,7 +1827,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> mapreduce.fileoutputcommitter.marksuccessfuljobs false - Hadoop >= 0.21 generates SUCCESS files in the output which can crash + Hadoop >= 0.21 generates SUCCESS files in the output which can crash the readers. This should not be an issue once Nutch is ported to the new MapReduce API but for now this parameter should prevent such cases. @@ -1841,7 +1841,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Specifies the SolrServer implementation to use. This is a string value of one of the following 'cloud', 'concurrent', 'http' or 'lb'. - The values represent CloudSolrServer, ConcurrentUpdateSolrServer, + The values represent CloudSolrServer, ConcurrentUpdateSolrServer, HttpSolrServer or LBHttpSolrServer respectively. @@ -1859,7 +1859,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> solr.zookeeper.url - Defines the Zookeeper URL which is an essential setting to be used + Defines the Zookeeper URL which is an essential setting to be used when using SolrCloud. This should be a fully qualified URL similar to the property provided within 'solr.server.url' above. @@ -1870,7 +1870,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> A comma-separated value representing the Solr servers to be used when - initiating LBHttpSolrServer as the SolrServer implementation. + initiating LBHttpSolrServer as the SolrServer implementation. @@ -1883,7 +1883,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> - + solr.commit.size 250 @@ -1897,7 +1897,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> solr.commit.index true - When closing the indexer, trigger a commit to the Solr server. + When closing the indexer, trigger a commit to the Solr server. @@ -1920,34 +1920,34 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> TransportClient. Either host and port must be defined or cluster. 
- + elastic.port 9300 The port to connect to using TransportClient. - + elastic.cluster The cluster name to discover. Either host and port must be defined or cluster. - + elastic.index - nutch + nutch Default index to send documents to. - + elastic.max.bulk.docs - 250 + 250 Maximum size of the bulk in number of documents. - + elastic.max.bulk.size - 2500500 + 2500500 Maximum size of the bulk in bytes. @@ -2175,7 +2175,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> page.load.delay 3 - The delay in seconds to use when loading a page with htmlunit or selenium. + The delay in seconds to use when loading a page with htmlunit or selenium. @@ -2210,7 +2210,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> true A Boolean value representing if javascript should - be enabled or disabled when using htmlunit. The default value is enabled. + be enabled or disabled when using htmlunit. The default value is enabled. @@ -2219,7 +2219,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> 3500 The timeout in milliseconds when loading javascript with lib-htmlunit. This - setting is used by protocol-htmlunit since they depending on + setting is used by protocol-htmlunit since they depending on lib-htmlunit for fetching. @@ -2239,7 +2239,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.driver firefox - A String value representing the flavour of Selenium + A String value representing the flavour of Selenium WebDriver() to use. Currently the following options exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomjs' and 'remote'. If 'remote' is used it is essential to also set correct properties for @@ -2275,7 +2275,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.grid.driver firefox - A String value representing the flavour of Selenium + A String value representing the flavour of Selenium WebDriver() used on the selenium grid. Currently the following options exist - 'firefox', 'phantomjs' @@ -2283,18 +2283,18 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.grid.binary - A String value representing the path to the browser binary + A String value representing the path to the browser binary location for each node - selenium.firefox.allowed.hosts localhost A String value representing the allowed hosts preference - according to the operating system hosts file (Example - /etc/hosts in Unix). + according to the operating system hosts file (Example - /etc/hosts in Unix). Currently this option exist for - 'firefox' @@ -2302,7 +2302,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.binary.timeout 45 A Long value representing the timeout value - for firefox to be available for command execution. The value is in seconds. + for firefox to be available for command execution. The value is in seconds. Currently this option exist for - 'firefox' @@ -2310,7 +2310,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.enable.flash false A Boolean value representing if flash should - be enabled or disabled. The default value is disabled. + be enabled or disabled. The default value is disabled. 
Currently this option exist for - 'firefox' @@ -2322,7 +2322,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Other options are: 1: Load all images, regardless of origin 2: Block all images - 3: Prevent third-party images from loading + 3: Prevent third-party images from loading Currently this option exist for - 'firefox'
@@ -2330,7 +2330,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.load.stylesheet 1 An Integer value representing the restriction on - loading stylesheet. The default value is no restriction i.e. load + loading stylesheet. The default value is no restriction i.e. load all stylesheet. Other options are: 1: Load all stylesheet
@@ -2338,6 +2338,16 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Currently this option exist for - 'firefox'
+
+ selenium.firefox.headless
+ false
+ A Boolean value indicating whether Firefox should
+ run headless. Make sure that the Firefox version is 55 or later
+ and the Selenium WebDriver version is 3.6.0 or later. The default value is false.
+ Currently this option exists only for 'firefox'.
+
+
+
interactiveselenium.handlers @@ -2362,7 +2372,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> store.http.headers false - Store the raw headers received by Nutch from the server, required to use the + Store the raw headers received by Nutch from the server, required to use the CommonCrawlDataDumper tool for the WARC format.
@@ -2373,7 +2383,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.links.outlinks.host.ignore false - Ignore outlinks that point out to the same host as the URL being indexed. + Ignore outlinks that point out to the same host as the URL being indexed. By default all outlinks are indexed. If db.ignore.internal.links is true (default value), this setting does nothing since the internal links are already ignored.
@@ -2384,7 +2394,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.links.inlinks.host.ignore false - Ignore inlinks coming from the same host as the URL being indexed. By default + Ignore inlinks coming from the same host as the URL being indexed. By default all inlinks are indexed. If db.ignore.internal.links is true (default value), this setting does nothing since the internal links are already ignored.
@@ -2508,15 +2518,15 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> - publisher.queue.type - Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). - Currently there exists an implemtation for RabbitMQ producer. + Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). + Currently there exists an implementation for a RabbitMQ producer.
@@ -2607,7 +2617,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Default is 'fanout.key' - The routingKey used by publisher to publish messages to specific queues. + The routingKey used by publisher to publish messages to specific queues. If the exchange type is "fanout", then this property is ignored.
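Together with the lib-selenium changes below, the new selenium.firefox.headless property can be switched on from nutch-site.xml. A minimal sketch, assuming the Selenium-based protocol plugin is enabled and a Firefox 55+ binary is installed:

  <property>
    <name>selenium.driver</name>
    <!-- use the Firefox flavour of WebDriver -->
    <value>firefox</value>
  </property>
  <property>
    <name>selenium.firefox.headless</name>
    <!-- requires Firefox 55 or later and Selenium WebDriver 3.6.0 or later -->
    <value>true</value>
  </property>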
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 701b7257d3..8cd28237e7 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -22,7 +22,7 @@ - Apache Nutch + Apache Nutch @@ -37,16 +37,12 @@ - - - + + - - - - + - + diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml index a86d665dc4..b062a6179b 100644 --- a/src/plugin/lib-selenium/plugin.xml +++ b/src/plugin/lib-selenium/plugin.xml @@ -19,157 +19,84 @@ ! A common framework for http protocol implementations !--> + id="lib-selenium" + name="HTTP Framework" + version="1.0" + provider-name="org.apache.nutch"> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index 6e137f9bf0..671c91ec05 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -37,6 +37,7 @@ import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxBinary; import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; import org.openqa.selenium.firefox.FirefoxProfile; import org.openqa.selenium.io.TemporaryFilesystem; import org.openqa.selenium.remote.DesiredCapabilities; @@ -46,9 +47,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import com.opera.core.systems.OperaDriver; - +import org.openqa.selenium.opera.OperaDriver; +import org.openqa.selenium.opera.OperaOptions; public class HttpWebClient { private static final Logger LOG = LoggerFactory @@ -59,12 +59,14 @@ public class HttpWebClient { @Override protected WebDriver initialValue() { + FirefoxOptions options = new FirefoxOptions(); FirefoxProfile profile = new FirefoxProfile(); profile.setPreference("permissions.default.stylesheet", 2); profile.setPreference("permissions.default.image", 2); profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false"); profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost"); - WebDriver driver = new FirefoxDriver(profile); + options.setProfile(profile); + WebDriver driver = new FirefoxDriver(options); return driver; }; }; @@ -81,16 +83,23 @@ public static WebDriver getDriverForPage(String url, Configuration conf) { String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost"); long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45); boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false); + boolean headless = conf.getBoolean("selenium.firefox.headless", false); int loadImage = conf.getInt("selenium.firefox.load.image", 1); int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1); - FirefoxProfile profile = new FirefoxProfile(); - FirefoxBinary binary = new FirefoxBinary(); - 
profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
- profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
- profile.setPreference("permissions.default.stylesheet", loadStylesheet);
- profile.setPreference("permissions.default.image", loadImage);
- binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
- driver = new FirefoxDriver(binary, profile);
+ FirefoxOptions options = new FirefoxOptions();
+ FirefoxProfile profile = new FirefoxProfile();
+ FirefoxBinary binary = new FirefoxBinary();
+ if (headless) {
+   // run Firefox without a visible UI (needs Firefox 55+ and Selenium 3.6.0+)
+   binary.addCommandLineOptions("--headless");
+ }
+ profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+ profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
+ profile.setPreference("permissions.default.stylesheet", loadStylesheet);
+ profile.setPreference("permissions.default.image", loadImage);
+ binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+ options.setProfile(profile);
+ options.setBinary(binary);
+ driver = new FirefoxDriver(options);
break;
case "chrome":
driver = new ChromeDriver();
@@ -166,7 +175,7 @@ public static String getHTMLContent(WebDriver driver, Configuration conf) {
public static void cleanUpDriver(WebDriver driver) {
if (driver != null) {
try {
- driver.close();
+ // driver.close() is skipped: quit() already closes all windows and ends the session
driver.quit();
TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
} catch (Exception e) {
diff --git a/src/plugin/protocol-interactiveselenium/README.md b/src/plugin/protocol-interactiveselenium/README.md
index dd43ee7948..95707f0308 100644
--- a/src/plugin/protocol-interactiveselenium/README.md
+++ b/src/plugin/protocol-interactiveselenium/README.md
@@ -36,3 +36,12 @@ Only basic functionality is included in the DefaultHandler that comes with the p
# Handler Info
Handlers are called in the order that they're specified in the configuration. A "clean" driver is used for each handler so multiple handlers won't interfere with each other. Page content is appended together from each handler and returned for the request.
+
+
+# Using headless mode
+
+A headless option was recently added to Firefox and Chrome (Firefox >= 55, Chrome >= 59).
+We used to rely on Xvfb and related tools for running in headless mode; now that a native option is available, it is better to use it.
+Enable it by setting selenium.firefox.headless to true; the default value is false.
+This option was tested with Firefox 57.0, geckodriver 0.19.1 and Selenium 3.7.1.
+Currently this option exists only for 'firefox'.
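For example, a nutch-site.xml override that runs this plugin's handlers with headless Firefox might look like the sketch below; DefaultHandler stands in for whatever handler list you have configured, and selenium.firefox.headless is the property introduced in conf/nutch-default.xml above:

  <property>
    <name>interactiveselenium.handlers</name>
    <!-- handlers are invoked in the order listed here -->
    <value>DefaultHandler</value>
  </property>
  <property>
    <name>selenium.firefox.headless</name>
    <!-- requires Firefox 55 or later and Selenium WebDriver 3.6.0 or later -->
    <value>true</value>
  </property>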