Skip to content

Commit

Permalink
fix: Incorrect character encoding detected for long text (#1871)
Browse files: browse the repository at this point in the history
  • Loading branch information
robinshine committed Apr 26, 2024
1 parent 4b157ac commit d2dc663
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 185 deletions.
211 changes: 104 additions & 107 deletions server-core/src/main/java/io/onedev/server/util/ContentDetector.java
Original file line number Diff line number Diff line change
@@ -1,107 +1,104 @@
package io.onedev.server.util;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import javax.annotation.Nullable;

import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;

public class ContentDetector {

    private static final Tika tika = new Tika();

    /**
     * Detects the charset of the supplied stream by handing it to
     * {@code UniversalEncodingDetector}.
     *
     * @param contentStream
     *          stream to be read for charset detection
     * @return
     *          detected charset, or {@code null} if charset can not be detected
     * @throws RuntimeException
     *          wrapping any {@link IOException} raised while reading the stream
     */
    @Nullable
    public static Charset detectCharset(InputStream contentStream) {
        Charset detected;
        try {
            detected = UniversalEncodingDetector.detect(contentStream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return detected;
    }

    /**
     * Detects the charset of the supplied content bytes by handing them to
     * {@code UniversalEncodingDetector}.
     *
     * @param contentBytes
     *          content to be detected
     * @return
     *          charset of the content, or {@code null} if the content is empty
     *          or the charset can not be detected
     * @throws RuntimeException
     *          wrapping any {@link IOException} raised during detection
     */
    @Nullable
    public static Charset detectCharset(byte[] contentBytes) {
        // Empty content carries no information to detect a charset from.
        if (contentBytes.length == 0) {
            return null;
        }
        try {
            return UniversalEncodingDetector.detect(contentBytes);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Tells whether the supplied content should be treated as binary.
     * <p>
     * Empty content is never binary. Otherwise the content is considered
     * textual (not binary) when its detected media type has the top-level
     * type {@code text}, or is one of a fixed list of text-like
     * {@code application/*} types; anything else is binary.
     *
     * @param contentBytes
     *          content to be checked
     * @param fileName
     *          optional file name passed along to media type detection
     * @return
     *          {@code true} if the content is considered binary
     */
    public static boolean isBinary(byte[] contentBytes, @Nullable String fileName) {
        if (contentBytes.length == 0) {
            return false;
        }

        MediaType detected = detectMediaType(contentBytes, fileName);

        boolean textual = detected.getType().equalsIgnoreCase("text")
                || detected.equals(MediaType.application("rls-services+xml"))
                || detected.equals(MediaType.application("xhtml+xml"))
                || detected.equals(MediaType.APPLICATION_XML)
                || detected.equals(MediaType.application("x-bat"))
                || detected.equals(MediaType.application("x-tex"))
                || detected.equals(MediaType.application("json"))
                || detected.equals(MediaType.application("x-sh"))
                || detected.equals(MediaType.application("javascript"))
                || detected.equals(MediaType.application("x-httpd-jsp"))
                || detected.equals(MediaType.application("x-httpd-php"));
        return !textual;
    }

    /**
     * Builds text from the supplied content bytes, optionally with help of
     * the file name.
     *
     * @param contentBytes
     *          content bytes to construct text from
     * @param fileName
     *          file name to help deciding if supplied content bytes represents text
     * @return
     *          text representation of content bytes, or {@code null} if the
     *          content is considered binary
     */
    @Nullable
    public static String convertToText(byte[] contentBytes, @Nullable String fileName) {
        if (isBinary(contentBytes, fileName)) {
            return null;
        }
        Charset detected = detectCharset(contentBytes);
        // Without a detected charset, fall back to the platform default decoding.
        return detected != null
                ? new String(contentBytes, detected)
                : new String(contentBytes);
    }

    /**
     * Detects the media type of the supplied content bytes using Tika.
     *
     * @param contentBytes
     *          content to be inspected
     * @param fileName
     *          optional file name passed to Tika as a detection hint
     * @return
     *          media type parsed from the type name reported by Tika
     */
    public static MediaType detectMediaType(byte[] contentBytes, @Nullable String fileName) {
        String typeName = tika.detect(contentBytes, fileName);
        return MediaType.parse(typeName);
    }

    /**
     * Detects the media type of the supplied stream using Tika.
     *
     * @param contentStream
     *          stream to be inspected
     * @param fileName
     *          optional file name passed to Tika as a detection hint
     * @return
     *          media type parsed from the type name reported by Tika
     * @throws RuntimeException
     *          wrapping any {@link IOException} raised while reading the stream
     */
    public static MediaType detectMediaType(InputStream contentStream, @Nullable String fileName) {
        String typeName;
        try {
            typeName = tika.detect(contentStream, fileName);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return MediaType.parse(typeName);
    }

}
package io.onedev.server.util;

import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

import javax.annotation.Nullable;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

public class ContentDetector {

    private static final Tika tika = new Tika();

    /**
     * Detects the charset of the supplied content bytes.
     * <p>
     * The content is fed to a {@code UniversalEncodingListener} in chunks of
     * 1024 bytes. Feeding stops as soon as the listener reports it is done,
     * or once the remaining tail (shorter than one chunk) has been handed over.
     *
     * @param bytes
     *          content to be detected
     * @return
     *          charset of the content, or {@code null} if the content is empty
     *          or the charset can not be detected
     */
    @Nullable
    public static Charset detectCharset(byte[] bytes) {
        // Empty content carries no information to detect a charset from.
        if (bytes.length == 0) {
            return null;
        }

        final int chunkSize = 1024;
        var listener = new UniversalEncodingListener(new Metadata());
        int offset = 0;

        // Hand over full chunks while at least one whole chunk remains.
        while (bytes.length - offset >= chunkSize) {
            listener.handleData(bytes, offset, chunkSize);
            if (listener.isDone()) {
                return listener.dataEnd();
            }
            offset += chunkSize;
        }

        // Hand over whatever is left (possibly nothing) before finishing.
        listener.handleData(bytes, offset, bytes.length - offset);
        return listener.dataEnd();
    }

    /**
     * Tells whether the supplied content should be treated as binary.
     * <p>
     * Empty content is never binary. Otherwise the content is considered
     * textual (not binary) when its detected media type has the top-level
     * type {@code text}, or is one of a fixed list of text-like
     * {@code application/*} types; anything else is binary.
     *
     * @param contentBytes
     *          content to be checked
     * @param fileName
     *          optional file name passed along to media type detection
     * @return
     *          {@code true} if the content is considered binary
     */
    public static boolean isBinary(byte[] contentBytes, @Nullable String fileName) {
        if (contentBytes.length == 0) {
            return false;
        }

        MediaType detected = detectMediaType(contentBytes, fileName);

        boolean textual = detected.getType().equalsIgnoreCase("text")
                || detected.equals(MediaType.application("rls-services+xml"))
                || detected.equals(MediaType.application("xhtml+xml"))
                || detected.equals(MediaType.APPLICATION_XML)
                || detected.equals(MediaType.application("x-bat"))
                || detected.equals(MediaType.application("x-tex"))
                || detected.equals(MediaType.application("json"))
                || detected.equals(MediaType.application("x-sh"))
                || detected.equals(MediaType.application("javascript"))
                || detected.equals(MediaType.application("x-httpd-jsp"))
                || detected.equals(MediaType.application("x-httpd-php"));
        return !textual;
    }

    /**
     * Builds text from the supplied content bytes, optionally with help of
     * the file name.
     *
     * @param contentBytes
     *          content bytes to construct text from
     * @param fileName
     *          file name to help deciding if supplied content bytes represents text
     * @return
     *          text representation of content bytes, or {@code null} if the
     *          content is considered binary
     */
    @Nullable
    public static String convertToText(byte[] contentBytes, @Nullable String fileName) {
        if (isBinary(contentBytes, fileName)) {
            return null;
        }
        Charset detected = detectCharset(contentBytes);
        // Without a detected charset, fall back to the platform default decoding.
        return detected != null
                ? new String(contentBytes, detected)
                : new String(contentBytes);
    }

    /**
     * Detects the media type of the supplied content bytes using Tika.
     *
     * @param contentBytes
     *          content to be inspected
     * @param fileName
     *          optional file name passed to Tika as a detection hint
     * @return
     *          media type parsed from the type name reported by Tika
     */
    public static MediaType detectMediaType(byte[] contentBytes, @Nullable String fileName) {
        String typeName = tika.detect(contentBytes, fileName);
        return MediaType.parse(typeName);
    }

    /**
     * Detects the media type of the supplied stream using Tika.
     *
     * @param contentStream
     *          stream to be inspected
     * @param fileName
     *          optional file name passed to Tika as a detection hint
     * @return
     *          media type parsed from the type name reported by Tika
     * @throws RuntimeException
     *          wrapping any {@link IOException} raised while reading the stream
     */
    public static MediaType detectMediaType(InputStream contentStream, @Nullable String fileName) {
        String typeName;
        try {
            typeName = tika.detect(contentStream, fileName);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return MediaType.parse(typeName);
    }

}

This file was deleted.

0 comments on commit d2dc663

Please sign in to comment.