-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
78 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Java流式转换超大文本的字符编码 | ||
|
||
使用powershell导出的mysqldump是utf-16 le的格式的文本。 | ||
在linux上需要使用utf-8的文本导入。这是本程序的由来。 | ||
|
||
程序使用java实现尽量使用buffered的io实现,可调节bufferSize,流式的读取文件,将原来的数据重新编码后写入新文件。 | ||
|
||
```java | ||
import java.io.*; | ||
import java.nio.charset.Charset; | ||
import java.nio.charset.CharsetEncoder; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.nio.file.StandardOpenOption; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
import java.util.stream.Stream; | ||
|
||
/** | ||
* <p>Title: </p> | ||
* <p>Description:主要功能:流式的转换文件文件的编码,适用于超大型文本文件的处理 | ||
* 测试使用321GB 29万行 的mysqldump文件(utf-16 le),在增大bufferSize的时候可以用200MB每秒的速度写入(utf-8) | ||
* 写入后 占用空间 169GB | ||
* </p> | ||
* | ||
* @author whp98 | ||
*/ | ||
public class FileEncodeChanger { | ||
public static final String UTF8_BOM = "\uFEFF"; | ||
//实测使用 8192 * 100000 会占用大约6G内存 | ||
public static int bufferSize = 8192 * 100000; | ||
|
||
public static void convertEncoding(Path sourceFile, Path targetFile, Charset sourceCharset, Charset targetCharset) throws IOException { | ||
CharsetEncoder encoder = targetCharset.newEncoder(); | ||
try (Stream<String> lines = Files.lines(sourceFile, sourceCharset); | ||
Writer writerOp = new OutputStreamWriter(Files.newOutputStream(targetFile, StandardOpenOption.CREATE, StandardOpenOption.WRITE), encoder); | ||
BufferedWriter writer = new BufferedWriter(writerOp, bufferSize) | ||
) { | ||
AtomicLong currentLine = new AtomicLong(0L); | ||
lines.forEach(line -> { | ||
try { | ||
currentLine.getAndIncrement(); | ||
if (currentLine.get() == 1) { | ||
if (line.startsWith(UTF8_BOM)) { | ||
line = line.substring(1); | ||
} | ||
writer.write(line); | ||
} else { | ||
writer.write(line); | ||
} | ||
if (currentLine.get() % 10000 == 0) { | ||
System.out.printf("write %s", (currentLine.get() / 10000) + "万 \n"); | ||
} | ||
writer.newLine(); | ||
} catch (IOException e) { | ||
throw new UncheckedIOException(e); | ||
} | ||
}); | ||
} | ||
} | ||
|
||
public static void main(String[] args) { | ||
Path sourceFile = Paths.get("D:\\mysql57\\mysql-5.7.31-winx64\\bin\\all231101.sql"); // 修改为实际文件路径 | ||
Path targetFile = Paths.get("F:\\all231101-utf8.sql"); // 修改为实际文件路径 | ||
Charset sourceCharset = StandardCharsets.UTF_16LE; // 源文件编码 | ||
Charset targetCharset = StandardCharsets.UTF_8; // 目标文件编码 | ||
try { | ||
convertEncoding(sourceFile, targetFile, sourceCharset, targetCharset); | ||
System.out.println("文件编码转换完成!"); | ||
} catch (IOException e) { | ||
throw new RuntimeException("文件操作失败:" + e.getMessage(), e); | ||
} | ||
} | ||
} | ||
|
||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters