diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 745588daff..36f6b529d4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -27,8 +27,8 @@ store.ip.address false - Enables us to capture the specific IP address - (InetSocketAddress) of the host which we connect to via + Enables us to capture the specific IP address + (InetSocketAddress) of the host which we connect to via the given protocol. Currently supported is protocol-ftp and http. @@ -45,7 +45,7 @@ confuse this setting with the http.content.limit setting. - + file.crawl.parent true @@ -84,7 +84,7 @@ http.agent.name - HTTP 'User-Agent' request header. MUST NOT be empty - + HTTP 'User-Agent' request header. MUST NOT be empty - please set this to a single word uniquely related to your organization. NOTE: You should also check other related properties: @@ -104,23 +104,23 @@ http.robots.agents Any other agents, apart from 'http.agent.name', that the robots - parser would look for in robots.txt. Multiple agents can be provided using + parser would look for in robots.txt. Multiple agents can be provided using comma as a delimiter. eg. mybot,foo-spider,bar-crawler - - The ordering of agents does NOT matter and the robots parser would make - decision based on the agent which matches first to the robots rules. - Also, there is NO need to add a wildcard (ie. "*") to this string as the - robots parser would smartly take care of a no-match situation. - - If no value is specified, by default HTTP agent (ie. 'http.agent.name') - would be used for user agent matching by the robots parser. + + The ordering of agents does NOT matter and the robots parser would make + decision based on the agent which matches first to the robots rules. + Also, there is NO need to add a wildcard (ie. "*") to this string as the + robots parser would smartly take care of a no-match situation. + + If no value is specified, by default HTTP agent (ie. 'http.agent.name') + would be used for user agent matching by the robots parser. http.robot.rules.whitelist - Comma separated list of hostnames or IP addresses to ignore + Comma separated list of hostnames or IP addresses to ignore robot rules parsing for. Use with care and only if you are explicitly allowed by the site owner to ignore the site's robots.txt! @@ -146,7 +146,7 @@ http.agent.url - A URL to advertise in the User-Agent header. This will + A URL to advertise in the User-Agent header. This will appear in parenthesis after the agent name. Custom dictates that this should be a URL of a page explaining the purpose and behavior of this crawler. @@ -165,7 +165,7 @@ http.agent.version Nutch-1.14-SNAPSHOT - A version string to advertise in the User-Agent + A version string to advertise in the User-Agent header. @@ -280,7 +280,7 @@ http.proxy.exception.list - A comma separated list of URL's and hosts that don't use the proxy + A comma separated list of URL's and hosts that don't use the proxy (e.g. intranets). Example: www.apache.org @@ -338,8 +338,8 @@ http.store.responsetime true - Enables us to record the response time of the - host which is the time period between start connection to end + Enables us to record the response time of the + host which is the time period between start connection to end connection of a pages host. The response time in milliseconds is stored in CrawlDb in CrawlDatum's meta data under key "_rs_" @@ -380,7 +380,7 @@ ftp.content.limit - 65536 + 65536 The length limit for downloaded content, in bytes. 
If this value is nonnegative (>=0), content longer than it will be truncated; otherwise, no truncation at all. @@ -454,7 +454,7 @@ The implementation of fetch schedule. DefaultFetchSchedule simply adds the original fetchInterval to the last fetch time, regardless of page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt - to the rate at which a given page is changed. + to the rate at which a given page is changed. @@ -524,7 +524,7 @@ db.preserve.backup true If true, updatedb will keep a backup of the previous CrawlDB - version in the old directory. In case of disaster, one can rename old to + version in the old directory. In case of disaster, one can rename old to current and restore the CrawlDB to its previous state. @@ -562,8 +562,8 @@ db.update.max.inlinks 10000 - Maximum number of inlinks to take into account when updating - a URL score in the crawlDB. Only the best scoring inlinks are kept. + Maximum number of inlinks to take into account when updating + a URL score in the crawlDB. Only the best scoring inlinks are kept. @@ -673,7 +673,7 @@ db.parsemeta.to.crawldb Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). - Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' + Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' will copy both the key 'lang' and its value to the corresponding entry in the crawldb. @@ -754,7 +754,7 @@ generate.count.mode host Determines how the URLs are counted for generator.max.count. - Default value is 'host' but can be 'domain'. Note that we do not count + Default value is 'host' but can be 'domain'. Note that we do not count per IP in the new version of the Generator. @@ -813,8 +813,8 @@ partition.url.mode byHost - Determines how to partition URLs. Default value is 'byHost', - also takes 'byDomain' or 'byIP'. + Determines how to partition URLs. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. @@ -822,9 +822,9 @@ crawl.gen.delay 604800000 - This value, expressed in milliseconds, defines how long we should keep the lock on records - in CrawlDb that were just selected for fetching. If these records are not updated - in the meantime, the lock is canceled, i.e. they become eligible for selecting. + This value, expressed in milliseconds, defines how long we should keep the lock on records + in CrawlDb that were just selected for fetching. If these records are not updated + in the meantime, the lock is canceled, i.e. they become eligible for selecting. Default value of this is 7 days (604800000 ms). @@ -834,9 +834,9 @@ fetcher.server.delay 5.0 - The number of seconds the fetcher will delay between + The number of seconds the fetcher will delay between successive requests to the same server. Note that this might get - overridden by a Crawl-Delay from a robots.txt and is used ONLY if + overridden by a Crawl-Delay from a robots.txt and is used ONLY if fetcher.threads.per.queue is set to 1. @@ -844,7 +844,7 @@ fetcher.server.min.delay 0.0 - The minimum number of seconds the fetcher will delay between + The minimum number of seconds the fetcher will delay between successive requests to the same server. This value is applicable ONLY if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking is turned off). @@ -860,7 +860,7 @@ amount of time retrieved from robots.txt Crawl-Delay, however long that might be. 
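To make the interplay of these delay settings concrete, a polite one-thread-per-host crawl could be sketched in nutch-site.xml as below (the values are illustrative assumptions, not recommended defaults):

  <property>
    <name>fetcher.server.delay</name>
    <!-- wait 5 seconds between successive requests to the same server;
         a robots.txt Crawl-Delay can override this value -->
    <value>5.0</value>
  </property>
  <property>
    <name>fetcher.threads.per.queue</name>
    <!-- keep at 1 so that fetcher.server.delay and the robots.txt
         Crawl-Delay are honoured instead of fetcher.server.min.delay -->
    <value>1</value>
  </property>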
- + fetcher.threads.fetch @@ -877,10 +877,10 @@ fetcher.threads.per.queue 1 This number is the maximum number of threads that - should be allowed to access a queue at one time. Setting it to + should be allowed to access a queue at one time. Setting it to a value > 1 will cause the Crawl-Delay value from robots.txt to be ignored and the value of fetcher.server.min.delay to be used - as a delay between successive requests to the same server instead + as a delay between successive requests to the same server instead of fetcher.server.delay. @@ -888,8 +888,8 @@ fetcher.queue.mode byHost - Determines how to put URLs into queues. Default value is 'byHost', - also takes 'byDomain' or 'byIP'. + Determines how to put URLs into queues. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. @@ -916,7 +916,7 @@ fetcher.timelimit.mins -1 This is the number of minutes allocated to the fetching. - Once this value is reached, any remaining entry from the input URL list is skipped + Once this value is reached, any remaining entry from the input URL list is skipped and all active queues are emptied. The default value of -1 deactivates the time limit. @@ -1008,7 +1008,7 @@ fetcher.follow.outlinks.ignore.external - true + true Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks in the output but not follow them. If db.ignore.external.links is true this directive is ignored. @@ -1016,22 +1016,22 @@ fetcher.bandwidth.target - -1 - Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of + -1 + Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of fetching threads automatically (up to fetcher.maxNum.threads). A value of -1 deactivates the functionality, in which case the number of fetching threads is fixed (see fetcher.threads.fetch). fetcher.maxNum.threads - 25 + 25 Max number of fetch threads allowed when using fetcher.bandwidth.target. Defaults to fetcher.threads.fetch if unspecified or set to a value lower than it. fetcher.bandwidth.target.check.everyNSecs - 30 + 30 (EXPERT) Value in seconds which determines how frequently we should reassess the optimal number of fetch threads when using fetcher.bandwidth.target. Defaults to 30 and must be at least 1. @@ -1052,7 +1052,7 @@ false Set this value to true if you want to use an implementation of the Publisher/Subscriber model. Make sure to set corresponding Publisher implementation specific properties - + @@ -1098,7 +1098,7 @@ in given order. For example, if this property has value: org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter then BasicIndexingFilter is applied first, and MoreIndexingFilter second. - + Filter ordering might have impact on result if one filter depends on output of another filter. @@ -1191,7 +1191,7 @@ mime.types.file tika-mimetypes.xml Name of file in CLASSPATH containing filename extension and - magic sequence to mime types mapping information. Overrides the default Tika config + magic sequence to mime types mapping information. Overrides the default Tika config if specified. @@ -1230,8 +1230,8 @@ include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By default Nutch includes crawling just HTML and plain text via HTTP, - and basic indexing and search plugins. 
In order to use HTTPS please enable - protocol-httpclient, but be aware of possible intermittent problems with the + and basic indexing and search plugins. In order to use HTTPS please enable + protocol-httpclient, but be aware of possible intermittent problems with the underlying commons-httpclient library. Set parsefilter-naivebayes for classification based focused crawler. @@ -1239,7 +1239,7 @@ plugin.excludes - Regular expression naming plugin directory names to exclude. + Regular expression naming plugin directory names to exclude. @@ -1252,7 +1252,7 @@ custom tags here will allow for their propagation into a pages outlinks, as well as allow for them to be included as part of an index. Values should be comma-delimited. ("tag1,tag2,tag3") Do not pad the tags with - white-space at their boundaries, if you are using anything earlier than Hadoop-0.21. + white-space at their boundaries, if you are using anything earlier than Hadoop-0.21. @@ -1310,8 +1310,8 @@ parser.html.outlinks.ignore_tags - Comma separated list of HTML tags, from which outlinks - shouldn't be extracted. Nutch takes links from: a, area, form, frame, + Comma separated list of HTML tags, from which outlinks + shouldn't be extracted. Nutch takes links from: a, area, form, frame, iframe, script, link, img. If you add any of those tags here, it won't be taken. Default is empty list. Probably reasonable value for most people would be "img,script,link". @@ -1340,7 +1340,7 @@ parsefilter.naivebayes.trainfile naivebayes-train.txt - Set the name of the file to be used for Naive Bayes training. The format will be: + Set the name of the file to be used for Naive Bayes training. The format will be: Each line contains two tab separated parts There are two columns/parts: 1. "1" or "0", "1" for relevant and "0" for irrelevant documents. @@ -1354,7 +1354,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parsefilter.naivebayes.wordlist naivebayes-wordlist.txt - Put the name of the file you want to be used as a list of + Put the name of the file you want to be used as a list of important words to be matched in the url for the model filter. The format should be one word per line. @@ -1362,8 +1362,8 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parser.timeout 30 - Timeout in seconds for the parsing of a document, otherwise treats it as an exception and - moves on the the following documents. This parameter is applied to any Parser implementation. + Timeout in seconds for the parsing of a document, otherwise treats it as an exception and + moves on the the following documents. This parameter is applied to any Parser implementation. Set to -1 to deactivate, bearing in mind that this could cause the parsing to crash because of a very long or corrupted document. @@ -1384,8 +1384,8 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this parser.skip.truncated true - Boolean value for whether we should skip parsing for truncated documents. By default this - property is activated due to extremely high levels of CPU which parsing can sometimes take. + Boolean value for whether we should skip parsing for truncated documents. By default this + property is activated due to extremely high levels of CPU which parsing can sometimes take. @@ -1422,10 +1422,10 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this - + tika.extractor.boilerpipe.algorithm ArticleExtractor - + Which Boilerpipe algorithm to use. 
Valid values are: DefaultExtractor, ArticleExtractor or CanolaExtractor. @@ -1507,7 +1507,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this Max depth value from seed allowed by default. Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE" as a seed metadata. This plugin adds a "_depth_" metadatum to the pages - to track the distance from the seed it was found from. + to track the distance from the seed it was found from. The depth is used to prioritise URLs in the generation step so that shallower pages are fetched first. @@ -1515,15 +1515,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this scoring.similarity.model cosine The type of similarity metric to use. Eg - cosine (which is, currently, the only available model). - Please make sure to set the model specific properties for the scoring to function properly. + Please make sure to set the model specific properties for the scoring to function properly. Description of these properties can be found on the wiki. @@ -1539,7 +1539,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> cosine.goldstandard.file goldstandard.txt - Path to the gold standard file which contains all the relevant text and terms, + Path to the gold standard file which contains all the relevant text and terms, pertaining to the domain. @@ -1547,7 +1547,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> scoring.similarity.stopword.file stopwords.txt - Name of the stopword text file. The user can specify a custom list of stop words + Name of the stopword text file. The user can specify a custom list of stop words in a text file. Each new stopword should be on a new line. @@ -1614,11 +1614,11 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.static - Used by plugin index-static to adds fields with static data at indexing time. + Used by plugin index-static to adds fields with static data at indexing time. You can specify a comma-separated list of fieldname:fieldcontent per Nutch job. Each fieldcontent can have multiple values separated by space, e.g., field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ... - It can be useful when collections can't be created by URL patterns, + It can be useful when collections can't be created by URL patterns, like in subcollection, but on a job-basis. @@ -1659,7 +1659,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Comma-separated list of keys to be taken from the parse metadata to generate fields. Can be used e.g. for 'description' or 'keywords' provided that these values are generated - by a parser (see parse-metatags plugin) + by a parser (see parse-metatags plugin) @@ -1667,7 +1667,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.content.md - Comma-separated list of keys to be taken from the content metadata to generate fields. + Comma-separated list of keys to be taken from the content metadata to generate fields. @@ -1676,7 +1676,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Comma-separated list of keys to be taken from the crawldb metadata to generate fields. - Can be used to index values propagated from the seeds with the plugin urlmeta + Can be used to index values propagated from the seeds with the plugin urlmeta @@ -1686,9 +1686,9 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> insightsService A string representing the information source to be used for GeoIP information - association. 
Either enter 'cityDatabase', 'connectionTypeDatabase', - 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the - Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, + association. Either enter 'cityDatabase', 'connectionTypeDatabase', + 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the + Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and available at runtime. @@ -1733,8 +1733,8 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> description,keywords Names of the metatags to extract, separated by ','. Use '*' to extract all metatags. Prefixes the names with 'metatag.' - in the parse-metadata. For instance to index description and keywords, - you need to activate the plugin index-metadata and set the value of the + in the parse-metadata. For instance to index description and keywords, + you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. @@ -1786,7 +1786,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> link.ignore.limit.domain true Limit to only a single outlink to the same domain. - + link.analyze.num.iterations @@ -1812,7 +1812,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Whether to delete gone pages from the web graph. - + link.loops.depth 2 The depth for the loops algorithm. @@ -1827,7 +1827,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> mapreduce.fileoutputcommitter.marksuccessfuljobs false - Hadoop >= 0.21 generates SUCCESS files in the output which can crash + Hadoop >= 0.21 generates SUCCESS files in the output which can crash the readers. This should not be an issue once Nutch is ported to the new MapReduce API but for now this parameter should prevent such cases. @@ -1841,7 +1841,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Specifies the SolrServer implementation to use. This is a string value of one of the following 'cloud', 'concurrent', 'http' or 'lb'. - The values represent CloudSolrServer, ConcurrentUpdateSolrServer, + The values represent CloudSolrServer, ConcurrentUpdateSolrServer, HttpSolrServer or LBHttpSolrServer respectively. @@ -1859,7 +1859,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> solr.zookeeper.url - Defines the Zookeeper URL which is an essential setting to be used + Defines the Zookeeper URL which is an essential setting to be used when using SolrCloud. This should be a fully qualified URL similar to the property provided within 'solr.server.url' above. @@ -1870,7 +1870,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> A comma-separated value representing the Solr servers to be used when - initiating LBHttpSolrServer as the SolrServer implementation. + initiating LBHttpSolrServer as the SolrServer implementation. @@ -1883,7 +1883,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> - + solr.commit.size 250 @@ -1897,7 +1897,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> solr.commit.index true - When closing the indexer, trigger a commit to the Solr server. + When closing the indexer, trigger a commit to the Solr server. @@ -1920,34 +1920,34 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> TransportClient. Either host and port must be defined or cluster. 
- + elastic.port 9300 The port to connect to using TransportClient. - + elastic.cluster The cluster name to discover. Either host and port must be defined or cluster. - + elastic.index - nutch + nutch Default index to send documents to. - + elastic.max.bulk.docs - 250 + 250 Maximum size of the bulk in number of documents. - + elastic.max.bulk.size - 2500500 + 2500500 Maximum size of the bulk in bytes. @@ -2175,7 +2175,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> page.load.delay 3 - The delay in seconds to use when loading a page with htmlunit or selenium. + The delay in seconds to use when loading a page with htmlunit or selenium. @@ -2210,7 +2210,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> true A Boolean value representing if javascript should - be enabled or disabled when using htmlunit. The default value is enabled. + be enabled or disabled when using htmlunit. The default value is enabled. @@ -2219,7 +2219,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> 3500 The timeout in milliseconds when loading javascript with lib-htmlunit. This - setting is used by protocol-htmlunit since they depending on + setting is used by protocol-htmlunit since they depending on lib-htmlunit for fetching. @@ -2239,7 +2239,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.driver firefox - A String value representing the flavour of Selenium + A String value representing the flavour of Selenium WebDriver() to use. Currently the following options exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomjs' and 'remote'. If 'remote' is used it is essential to also set correct properties for @@ -2275,7 +2275,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.grid.driver firefox - A String value representing the flavour of Selenium + A String value representing the flavour of Selenium WebDriver() used on the selenium grid. Currently the following options exist - 'firefox', 'phantomjs' @@ -2283,18 +2283,18 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.grid.binary - A String value representing the path to the browser binary + A String value representing the path to the browser binary location for each node - selenium.firefox.allowed.hosts localhost A String value representing the allowed hosts preference - according to the operating system hosts file (Example - /etc/hosts in Unix). + according to the operating system hosts file (Example - /etc/hosts in Unix). Currently this option exist for - 'firefox' @@ -2302,7 +2302,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.binary.timeout 45 A Long value representing the timeout value - for firefox to be available for command execution. The value is in seconds. + for firefox to be available for command execution. The value is in seconds. Currently this option exist for - 'firefox' @@ -2310,7 +2310,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.enable.flash false A Boolean value representing if flash should - be enabled or disabled. The default value is disabled. + be enabled or disabled. The default value is disabled. 
Currently this option exist for - 'firefox' @@ -2322,7 +2322,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Other options are: 1: Load all images, regardless of origin 2: Block all images - 3: Prevent third-party images from loading + 3: Prevent third-party images from loading Currently this option exist for - 'firefox'
@@ -2330,7 +2330,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> selenium.firefox.load.stylesheet 1 An Integer value representing the restriction on - loading stylesheet. The default value is no restriction i.e. load + loading stylesheet. The default value is no restriction i.e. load all stylesheet. Other options are: 1: Load all stylesheet
@@ -2338,6 +2338,16 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Currently this option exist for - 'firefox'
+
+ selenium.firefox.headless
+ false
+ A Boolean value indicating whether Firefox should
+ run headless. Make sure that the Firefox version is 55 or later
+ and the Selenium WebDriver version is 3.6.0 or later. The default value is false.
+ Currently this option exists only for 'firefox'.
+
+
+
interactiveselenium.handlers @@ -2362,7 +2372,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> store.http.headers false - Store the raw headers received by Nutch from the server, required to use the + Store the raw headers received by Nutch from the server, required to use the CommonCrawlDataDumper tool for the WARC format.
@@ -2373,7 +2383,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.links.outlinks.host.ignore false - Ignore outlinks that point out to the same host as the URL being indexed. + Ignore outlinks that point out to the same host as the URL being indexed. By default all outlinks are indexed. If db.ignore.internal.links is true (default value), this setting does nothing since the internal links are already ignored.
@@ -2384,7 +2394,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> index.links.inlinks.host.ignore false - Ignore inlinks coming from the same host as the URL being indexed. By default + Ignore inlinks coming from the same host as the URL being indexed. By default all inlinks are indexed. If db.ignore.internal.links is true (default value), this setting does nothing since the internal links are already ignored.
@@ -2508,15 +2518,15 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> - publisher.queue.type - Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). - Currently there exists an implemtation for RabbitMQ producer. + Choose the type of Queue being used (ex - RabbitMQ, ActiveMq, Kafka, etc). + Currently there exists an implementation for a RabbitMQ producer.
@@ -2607,7 +2617,7 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> Default is 'fanout.key' - The routingKey used by publisher to publish messages to specific queues. + The routingKey used by publisher to publish messages to specific queues. If the exchange type is "fanout", then this property is ignored.
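Together with the lib-selenium changes below, the new selenium.firefox.headless property can be switched on from nutch-site.xml. A minimal sketch, assuming the Selenium-based protocol plugin is enabled and a Firefox 55+ binary is installed:

  <property>
    <name>selenium.driver</name>
    <!-- use the Firefox flavour of WebDriver -->
    <value>firefox</value>
  </property>
  <property>
    <name>selenium.firefox.headless</name>
    <!-- requires Firefox 55 or later and Selenium WebDriver 3.6.0 or later -->
    <value>true</value>
  </property>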
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml index 701b7257d3..8cd28237e7 100644 --- a/src/plugin/lib-selenium/ivy.xml +++ b/src/plugin/lib-selenium/ivy.xml @@ -22,7 +22,7 @@ - Apache Nutch + Apache Nutch @@ -37,16 +37,12 @@ - - - + + - - - - + - + diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml index a86d665dc4..b062a6179b 100644 --- a/src/plugin/lib-selenium/plugin.xml +++ b/src/plugin/lib-selenium/plugin.xml @@ -19,157 +19,84 @@ ! A common framework for http protocol implementations !--> + id="lib-selenium" + name="HTTP Framework" + version="1.0" + provider-name="org.apache.nutch"> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index 6e137f9bf0..671c91ec05 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -37,6 +37,7 @@ import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxBinary; import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; import org.openqa.selenium.firefox.FirefoxProfile; import org.openqa.selenium.io.TemporaryFilesystem; import org.openqa.selenium.remote.DesiredCapabilities; @@ -46,9 +47,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import com.opera.core.systems.OperaDriver; - +import org.openqa.selenium.opera.OperaDriver; +import org.openqa.selenium.opera.OperaOptions; public class HttpWebClient { private static final Logger LOG = LoggerFactory @@ -59,12 +59,14 @@ public class HttpWebClient { @Override protected WebDriver initialValue() { + FirefoxOptions options = new FirefoxOptions(); FirefoxProfile profile = new FirefoxProfile(); profile.setPreference("permissions.default.stylesheet", 2); profile.setPreference("permissions.default.image", 2); profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false"); profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost"); - WebDriver driver = new FirefoxDriver(profile); + options.setProfile(profile); + WebDriver driver = new FirefoxDriver(options); return driver; }; }; @@ -81,16 +83,23 @@ public static WebDriver getDriverForPage(String url, Configuration conf) { String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost"); long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45); boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false); + boolean headless = conf.getBoolean("selenium.firefox.headless", false); int loadImage = conf.getInt("selenium.firefox.load.image", 1); int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1); - FirefoxProfile profile = new FirefoxProfile(); - FirefoxBinary binary = new FirefoxBinary(); - 
profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
- profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
- profile.setPreference("permissions.default.stylesheet", loadStylesheet);
- profile.setPreference("permissions.default.image", loadImage);
- binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
- driver = new FirefoxDriver(binary, profile);
+ FirefoxOptions options = new FirefoxOptions();
+ FirefoxProfile profile = new FirefoxProfile();
+ FirefoxBinary binary = new FirefoxBinary();
+ if (headless) {
+   // run Firefox without a visible UI (needs Firefox 55+ and Selenium 3.6.0+)
+   binary.addCommandLineOptions("--headless");
+ }
+ profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+ profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
+ profile.setPreference("permissions.default.stylesheet", loadStylesheet);
+ profile.setPreference("permissions.default.image", loadImage);
+ binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+ options.setProfile(profile);
+ options.setBinary(binary);
+ driver = new FirefoxDriver(options);
break;
case "chrome":
driver = new ChromeDriver();
@@ -166,7 +175,7 @@ public static String getHTMLContent(WebDriver driver, Configuration conf) {
public static void cleanUpDriver(WebDriver driver) {
if (driver != null) {
try {
- driver.close();
+ // driver.close() is skipped: quit() already closes all windows and ends the session
driver.quit();
TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
} catch (Exception e) {
diff --git a/src/plugin/protocol-interactiveselenium/README.md b/src/plugin/protocol-interactiveselenium/README.md
index dd43ee7948..95707f0308 100644
--- a/src/plugin/protocol-interactiveselenium/README.md
+++ b/src/plugin/protocol-interactiveselenium/README.md
@@ -36,3 +36,12 @@ Only basic functionality is included in the DefaultHandler that comes with the p
# Handler Info
Handlers are called in the order that they're specified in the configuration. A "clean" driver is used for each handler so multiple handlers won't interfere with each other. Page content is appended together from each handler and returned for the request.
+
+
+# Using headless mode
+
+A headless option was recently added to Firefox and Chrome (Firefox >= 55, Chrome >= 59).
+We used to rely on Xvfb and related tools for running in headless mode; now that a native option is available, it is better to use it.
+Enable it by setting selenium.firefox.headless to true; the default value is false.
+This option was tested with Firefox 57.0, geckodriver 0.19.1 and Selenium 3.7.1.
+Currently this option exists only for 'firefox'.
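For example, a nutch-site.xml override that runs this plugin's handlers with headless Firefox might look like the sketch below; DefaultHandler stands in for whatever handler list you have configured, and selenium.firefox.headless is the property introduced in conf/nutch-default.xml above:

  <property>
    <name>interactiveselenium.handlers</name>
    <!-- handlers are invoked in the order listed here -->
    <value>DefaultHandler</value>
  </property>
  <property>
    <name>selenium.firefox.headless</name>
    <!-- requires Firefox 55 or later and Selenium WebDriver 3.6.0 or later -->
    <value>true</value>
  </property>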