Skip to content

Commit

Permalink
remove metaData
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Dec 19, 2024
1 parent 405751d commit 605d287
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ public void run() {
log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
// access an url
final long startTime = SystemUtil.currentTimeMillis();
responseData = client.execute(
RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
responseData = client.execute(RequestDataBuilder.newRequestData().method(urlQueue.getMethod())
.url(urlQueue.getUrl()).weight(urlQueue.getWeight()).build());
responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
responseData.setParentUrl(urlQueue.getParentUrl());
responseData.setSessionId(crawlerContext.sessionId);
Expand All @@ -163,7 +163,7 @@ public void run() {
} else {
log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
// redirect
storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null,
storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), urlQueue.getWeight(),
urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
}
}
Expand Down Expand Up @@ -234,7 +234,8 @@ protected void addSitemapsFromRobotsTxt(final UrlQueue<?> urlQueue) {
if (sitemaps != null) {
for (final String childUrl : sitemaps) {
try {
storeChildUrl(childUrl, urlQueue.getUrl(), null, urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
storeChildUrl(childUrl, urlQueue.getUrl(), urlQueue.getWeight(),
urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
} catch (final Exception e) {
log(logHelper, LogType.PROCESS_CHILD_URL_BY_EXCEPTION, crawlerContext, urlQueue, childUrl, e);
}
Expand All @@ -253,7 +254,8 @@ protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?>
ResponseData responseData = null;
try {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(urlQueue.getUrl()).build());
responseData = client
.execute(RequestDataBuilder.newRequestData().head().url(urlQueue.getUrl()).weight(urlQueue.getWeight()).build());
if (responseData != null && responseData.getLastModified() != null
&& responseData.getLastModified().getTime() <= urlQueue.getLastModified().longValue()
&& responseData.getHttpStatusCode() == 200) {
Expand Down Expand Up @@ -301,22 +303,23 @@ protected void storeChildUrls(final Set<RequestData> childUrlList, final String

// add url and filter
final Set<String> urlSet = new HashSet<>();
final List<UrlQueue<?>> childList = childUrlList.stream().filter(d -> StringUtil.isNotBlank(d.getUrl())
&& urlSet.add(d.getUrl() + "\n" + d.getMetaData()) && crawlerContext.urlFilter.match(d.getUrl())).map(d -> {
final List<UrlQueue<?>> childList = childUrlList.stream()
.filter(d -> StringUtil.isNotBlank(d.getUrl()) && urlSet.add(d.getUrl()) && crawlerContext.urlFilter.match(d.getUrl()))
.map(d -> {
final UrlQueue<?> uq = crawlerContainer.getComponent("urlQueue");
uq.setCreateTime(SystemUtil.currentTimeMillis());
uq.setDepth(depth);
uq.setMethod(Constants.GET_METHOD);
uq.setParentUrl(url);
uq.setSessionId(crawlerContext.sessionId);
uq.setUrl(d.getUrl());
uq.setMetaData(d.getMetaData());
uq.setWeight(d.getWeight());
return uq;
}).collect(Collectors.toList());
urlQueueService.offerAll(crawlerContext.sessionId, childList);
}

protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
if (crawlerContext.getMaxDepth() >= 0 && depth > crawlerContext.getMaxDepth()) {
return;
}
Expand All @@ -331,7 +334,7 @@ protected void storeChildUrl(final String childUrl, final String parentUrl, fina
uq.setParentUrl(parentUrl);
uq.setSessionId(crawlerContext.sessionId);
uq.setUrl(childUrl);
uq.setMetaData(metaData);
uq.setWeight(weight);
childList.add(uq);
urlQueueService.offerAll(crawlerContext.sessionId, childList);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ public RequestDataContext url(final String url) {
return this;
}

public RequestDataContext metaData(final String metaData) {
data.setMetaData(metaData);
/**
 * Sets the crawl weight on the request being built and returns this
 * context for fluent chaining.
 *
 * @param weight the weight value stored on the underlying {@code RequestData}
 * @return this builder context
 */
public RequestDataContext weight(final float weight) {
data.setWeight(weight);
return this;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
package org.codelibs.fess.crawler.entity;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

import org.codelibs.fess.crawler.Constants;
Expand All @@ -32,7 +35,7 @@ public enum Method {

private String url;

private String metaData;
private float weight = 1.0f;

public Method getMethod() {
return method;
Expand Down Expand Up @@ -62,36 +65,35 @@ public void setUrl(final String url) {
this.url = url;
}

public String getMetaData() {
return metaData;
/**
 * Returns the weight of this request (defaults to {@code 1.0f} per the
 * field initializer).
 *
 * @return the request weight
 */
public float getWeight() {
return weight;
}

public void setMetaData(final String metaData) {
this.metaData = metaData;
}

@Override
public String toString() {
return "RequestData [method=" + method + ", url=" + url + "]";
/**
 * Sets the weight of this request.
 *
 * @param weight the weight value; participates in {@code equals}/{@code hashCode}
 */
// NOTE(review): parameter made final to match the convention used by every
// other setter in this class (e.g. setUrl(final String url)).
public void setWeight(final float weight) {
    this.weight = weight;
}

/**
 * Hash code consistent with {@link #equals(Object)}: method, url and weight
 * all participate. Boxing via {@code Objects.hash} yields
 * {@code Float.hashCode(weight)}, which matches the
 * {@code Float.floatToIntBits} comparison used in {@code equals}.
 */
@Override
public int hashCode() {
    return Objects.hash(method, url, weight);
}

@Override
public boolean equals(final Object obj) {
if (this == obj) {
public boolean equals(Object obj) {
if (this == obj)
return true;
}
if (obj == null || getClass() != obj.getClass()) {
if (obj == null)
return false;
}
final RequestData other = (RequestData) obj;
if (!Objects.equals(metaData, other.metaData) || method != other.method || !Objects.equals(url, other.url)) {
if (getClass() != obj.getClass())
return false;
}
return true;
RequestData other = (RequestData) obj;
return method == other.method && Objects.equals(url, other.url)
&& Float.floatToIntBits(weight) == Float.floatToIntBits(other.weight);
}

/**
 * Debug representation including the weight introduced by this change.
 */
@Override
public String toString() {
return "RequestData [method=" + method + ", url=" + url + ", weight=" + weight + "]";
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,9 @@ protected void storeChildUrls(final CrawlerContext crawlerContext, final Set<Req
final int depth, final String encoding) {
// add url and filter
final Set<String> urlSet = new HashSet<>();
final List<UrlQueue<?>> childList = childUrlList.stream().filter(d -> StringUtil.isNotBlank(d.getUrl())
&& urlSet.add(d.getUrl() + "\n" + d.getMetaData()) && crawlerContext.getUrlFilter().match(d.getUrl())).map(d -> {
final List<UrlQueue<?>> childList = childUrlList.stream()
.filter(d -> StringUtil.isNotBlank(d.getUrl()) && urlSet.add(d.getUrl()) && crawlerContext.getUrlFilter().match(d.getUrl()))
.map(d -> {
final UrlQueue<?> uq = crawlerContainer.getComponent("urlQueue");
uq.setCreateTime(SystemUtil.currentTimeMillis());
uq.setDepth(depth);
Expand All @@ -183,7 +184,7 @@ protected void storeChildUrls(final CrawlerContext crawlerContext, final Set<Req
uq.setParentUrl(url);
uq.setSessionId(crawlerContext.getSessionId());
uq.setUrl(d.getUrl());
uq.setMetaData(d.getMetaData());
uq.setWeight(d.getWeight());
return uq;
}).collect(Collectors.toList());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public void process(final ResponseData responseData) {
final Set<RequestData> requestDataSet = new LinkedHashSet<>();
for (final Sitemap sitemap : sitemapSet.getSitemaps()) {
if (sitemap != null) {
requestDataSet.add(RequestDataBuilder.newRequestData().get().url(sitemap.getLoc()).build());
requestDataSet.add(RequestDataBuilder.newRequestData().get().url(sitemap.getLoc()).build()); // TODO priority
}
}
throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#process");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ public void add(final String sessionId, final String url) {
urlQueue.setSessionId(sessionId);
urlQueue.setMethod(Constants.GET_METHOD);
urlQueue.setUrl(url);
urlQueue.setUrl(url);
urlQueue.setDepth(0);
urlQueue.setCreateTime(SystemUtil.currentTimeMillis());
urlQueueList.add(urlQueue);
Expand Down Expand Up @@ -215,12 +214,13 @@ public void generateUrlQueues(final String previousSessionId, final String sessi
for (final Map.Entry<String, AccessResultImpl<Long>> entry : arMap.entrySet()) {
synchronized (urlQueueList) {
final UrlQueueImpl<Long> urlQueue = new UrlQueueImpl<>();
final AccessResultImpl<Long> value = entry.getValue();
urlQueue.setSessionId(sessionId);
urlQueue.setMethod(entry.getValue().getMethod());
urlQueue.setUrl(entry.getValue().getUrl());
urlQueue.setParentUrl(entry.getValue().getParentUrl());
urlQueue.setMethod(value.getMethod());
urlQueue.setUrl(value.getUrl());
urlQueue.setParentUrl(value.getParentUrl());
urlQueue.setDepth(0);
urlQueue.setLastModified(entry.getValue().getLastModified());
urlQueue.setLastModified(value.getLastModified());
urlQueue.setCreateTime(SystemUtil.currentTimeMillis());
urlQueueList.add(urlQueue);
}
Expand Down

0 comments on commit 605d287

Please sign in to comment.