Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add encoding.iconv #22332

Merged
merged 30 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
afa6d92
add encoding.iconv
kbkpbot Sep 27, 2024
ee19d88
fix typo
kbkpbot Sep 27, 2024
f6e6fe7
add iconv.h
kbkpbot Sep 27, 2024
e4e62fb
fix;add CP_OEMCP
kbkpbot Sep 28, 2024
3520478
use []u8 as func interface
kbkpbot Sep 28, 2024
bf8890c
Update vlib/encoding/iconv/iconv.v
spytheman Sep 28, 2024
95199c7
ci: skip iconv_test.v on `ubuntu-docker-musl`; run `brew install libi…
spytheman Sep 28, 2024
30dd17f
add `#flag darwin -liconv` to iconv_nix.c.v
spytheman Sep 29, 2024
a617594
Merge branch 'vlang:master' into encoding.iconv
kbkpbot Sep 30, 2024
2ae9f64
remove UTF16LE test on CI macos
kbkpbot Sep 30, 2024
06bd6f6
Merge branch 'vlang:master' into encoding.iconv
kbkpbot Sep 30, 2024
462da8e
Merge branch 'vlang:master' into encoding.iconv
kbkpbot Sep 30, 2024
87a0898
revert to &u8 interface, fix linux BOM header, change all codepage na…
kbkpbot Sep 30, 2024
5c71940
handle endian
kbkpbot Sep 30, 2024
b743462
more detect for UTF16,UTF32
kbkpbot Oct 1, 2024
7595e4e
fix test, as macos always use big-endian for UTF16,UTF32
kbkpbot Oct 1, 2024
e15d88c
performance hack for UTF-8; empty string support
kbkpbot Oct 1, 2024
6e5a8b5
add helper funcs: create_utf_string_with_bom,write_file_utf_string_wi…
kbkpbot Oct 1, 2024
d7294d4
add new funcs read_file_encoding/write_file_encoding; add windows UTF…
kbkpbot Oct 1, 2024
2e3a399
remove UTF16/UTF32 support, as they cause confusion
kbkpbot Oct 1, 2024
b6325a9
add ANSI on windows
kbkpbot Oct 1, 2024
203abd6
Update vlib/encoding/iconv/iconv.v
spytheman Oct 1, 2024
b1d10e6
Update vlib/encoding/iconv/iconv.v
spytheman Oct 1, 2024
8350756
Merge branch 'vlang:master' into encoding.iconv
kbkpbot Oct 1, 2024
e4dadc2
add read/write file test
kbkpbot Oct 2, 2024
ec4e5a3
workaround for macos-12 not supporting UTF16LE?
kbkpbot Oct 2, 2024
d8294bb
try fix macos-12 fail on UTF16LE
kbkpbot Oct 2, 2024
6264a00
debug
kbkpbot Oct 2, 2024
f6679c3
debug
kbkpbot Oct 2, 2024
d25f9e5
as macos-12 has no UTF16LE, just UTF-16LE.... :(
kbkpbot Oct 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/macos_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ jobs:
echo $VFLAGS
v cmd/tools/test_if_v_test_system_works.v
./cmd/tools/test_if_v_test_system_works
- name: Install iconv for encoding.iconv
run: brew install libiconv
- name: Test pure V math module
run: v -exclude @vlib/math/*.c.v test vlib/math/
- name: Self tests
Expand Down
1 change: 1 addition & 0 deletions cmd/tools/modules/testing/common.v
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ pub fn new_test_session(_vargs string, will_compile bool) TestSession {
skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'
skip_files << 'vlib/crypto/ecdsa/ecdsa_test.v'
skip_files << 'vlib/x/ttf/ttf_test.v'
skip_files << 'vlib/encoding/iconv/iconv_test.v' // needs libiconv to be installed
}
if github_job == 'tests-sanitize-memory-clang' {
skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'
Expand Down
144 changes: 144 additions & 0 deletions vlib/encoding/iconv/iconv.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
module iconv

// Module iconv provides functions to convert between vstring(UTF8) and other encodings.
import os

// reverse_u16 returns `src` with its two bytes swapped.
@[inline]
fn reverse_u16(src u16) u16 {
	upper := src << 8
	lower := src >> 8
	return u16(lower | upper)
}

// reverse_u32 returns `src` with the order of its four bytes reversed.
@[inline]
fn reverse_u32(src u32) u32 {
	byte0_to_3 := src << 24
	byte1_to_2 := (src << 8) & 0x00FF_0000
	byte2_to_1 := (src >> 8) & 0x0000_FF00
	byte3_to_0 := src >> 24
	return u32(byte3_to_0 | byte2_to_1 | byte1_to_2 | byte0_to_3)
}

// vstring_to_encoding converts the V string `str` (UTF-8) to bytes in the `tocode` encoding.
// The endian-less names UTF16/UTF-16/UTF32/UTF-32 are rejected (compared case-insensitively);
// callers must pick an explicit byte order instead.
// Returns an error for those names, or whatever error `conv` reports.
// tips: use `iconv --list` to check for supported encodings
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
	encoding_name := tocode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		// same wording as write_file_encoding/read_file_encoding
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	return conv(tocode, 'UTF-8', str.str, str.len)
}

// encoding_to_vstring converts the given `bytes`, encoded as `fromcode`, to a V string (UTF-8).
// The endian-less names UTF16/UTF-16/UTF32/UTF-32 are rejected (compared case-insensitively);
// callers must pick an explicit byte order instead.
// Returns an error for those names, or whatever error `conv` reports.
// tips: use `iconv --list` to check for supported encodings
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
	encoding_name := fromcode.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		// same wording as write_file_encoding/read_file_encoding
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	mut dst := conv('UTF-8', fromcode, bytes.data, bytes.len)!
	dst << 0 // add a tail zero, to build a vstring
	// NOTE(review): the string is built from a C string, so presumably it ends at the
	// first zero byte of the converted data — confirm if embedded NULs matter.
	return unsafe { cstring_to_vstring(dst.data) }
}

// create_utf_string_with_bom returns a copy of `src` with the byte order mark of
// `utf_type` in front. `utf_type` is matched case-insensitively, with or without the dash:
//   utf8    -> EF BB BF
//   utf16le -> FF FE
//   utf16be -> FE FF
//   utf32le -> FF FE 00 00
//   utf32be -> 00 00 FE FF
// Any other `utf_type` yields an unchanged copy of `src`.
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
	bom := match utf_type.to_upper() {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	mut with_bom := []u8{cap: bom.len + src.len}
	with_bom << bom
	with_bom << src
	return with_bom
}

// remove_utf_string_with_bom returns a copy of `src` without the leading byte order
// mark of `utf_type`. `utf_type` is matched case-insensitively, with or without the dash:
//   utf8    -> EF BB BF
//   utf16le -> FF FE
//   utf16be -> FE FF
//   utf32le -> FF FE 00 00
//   utf32be -> 00 00 FE FF
// If `src` does not start with that mark, or `utf_type` is anything else, an unchanged
// copy of `src` is returned. A buffer that consists of the mark alone yields an empty array.
@[direct_array_access]
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
	bom := match utf_type.to_upper() {
		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
		else { []u8{} }
	}
	// `>=` (not `>`): a BOM-only buffer must be stripped too
	if bom.len > 0 && src.len >= bom.len && src[..bom.len] == bom {
		return src[bom.len..].clone()
	}
	return src.clone()
}

// write_file_encoding converts `text` to `encoding` and writes the result to the file at `path`.
// When `bom` is true and the upper-cased `encoding` starts with `UTF`, the bytes are passed
// through create_utf_string_with_bom before writing.
// The endian-less names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error.
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
	upper_name := encoding.to_upper()
	if upper_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	converted := vstring_to_encoding(text, encoding)!
	wants_bom := bom && upper_name.starts_with('UTF')
	payload := if wants_bom {
		create_utf_string_with_bom(converted, encoding)
	} else {
		converted
	}
	os.write_file_array(path, payload)!
}

// read_file_encoding reads the file at `path`, strips a leading BOM matching `encoding`
// (see remove_utf_string_with_bom) and returns the contents converted to a V string.
// The endian-less names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error.
// An error is also returned when the file cannot be read or the conversion fails.
pub fn read_file_encoding(path string, encoding string) !string {
	encoding_name := encoding.to_upper()
	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
	}
	// os.read_bytes reports I/O failures; os.read_file_array would hand back an
	// empty array for an unreadable file, which would then be returned as ''.
	encoding_bytes := os.read_bytes(path)!
	encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
	return encoding_to_vstring(encoding_without_bom_bytes, encoding)!
}
58 changes: 58 additions & 0 deletions vlib/encoding/iconv/iconv_nix.c.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module iconv

// Module iconv provides functions to convert between vstring(UTF8) and other encodings.

#include <iconv.h>
#flag darwin -liconv

// C.iconv_open allocates a conversion descriptor for converting `fromcode` text into `tocode`.
// `conv` below treats a return value of -1 as failure.
fn C.iconv_open(tocode &u8, fromcode &u8) voidptr
// C.iconv_close releases a descriptor obtained from C.iconv_open.
fn C.iconv_close(cd voidptr) int
// C.iconv converts bytes from `*inbuf` into `*outbuf`; the pointer and byte-count arguments
// are passed by address, so the call can update them. `conv` below treats usize(-1) as failure.
fn C.iconv(cd voidptr, inbuf &&u8, inbytesleft &usize, outbuf &&u8, outbytesleft &usize) usize

// conv converts `src_len` bytes at `src` from the `fromcode` encoding to the `tocode`
// encoding through the C iconv API and returns the converted bytes.
// Both encoding names are upper-cased before use.
// Errors: negative `src_len`, an encoding pair iconv_open rejects, or a failed iconv call.
@[direct_array_access]
fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
	if src_len < 0 {
		return error('src length error')
	}

	// macos-12 only knows the dashed spellings (UTF-16LE, ...), so the dash-less
	// aliases are rewritten for both the source and the destination name.
	upper_from := fromcode.to_upper()
	from_name := match upper_from {
		'UTF16LE' { 'UTF-16LE' }
		'UTF16BE' { 'UTF-16BE' }
		'UTF32LE' { 'UTF-32LE' }
		'UTF32BE' { 'UTF-32BE' }
		else { upper_from }
	}
	upper_to := tocode.to_upper()
	to_name := match upper_to {
		'UTF16LE' { 'UTF-16LE' }
		'UTF16BE' { 'UTF-16BE' }
		'UTF32LE' { 'UTF-32LE' }
		'UTF32BE' { 'UTF-32BE' }
		else { upper_to }
	}

	handle := C.iconv_open(to_name.str, from_name.str)
	if isize(handle) == -1 {
		return error('platform can\'t convert from ${from_name} to ${to_name}')
	}
	defer {
		C.iconv_close(handle)
	}

	// output buffer sized at 4 bytes per input byte (plus one spare unit);
	// the original author considers this enough for the converted text
	mut out := []u8{len: (src_len + 1) * 4}

	// iconv moves the cursors forward and counts the remaining bytes down
	mut in_cursor := &u8(src)
	mut out_cursor := &u8(out.data)
	mut in_remaining := usize(src_len)
	mut out_remaining := usize(out.len)
	status := C.iconv(handle, &in_cursor, &in_remaining, &out_cursor, &out_remaining)
	if status == usize(-1) {
		return error('convert encoding string fail, iconv return ${status}')
	}

	// drop the unused tail of the output buffer
	out.trim(out.len - int(out_remaining))
	return out
}
129 changes: 129 additions & 0 deletions vlib/encoding/iconv/iconv_test.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import encoding.iconv
import os

// test_vstring_to_encoding checks the byte output of vstring_to_encoding for the empty
// string, for 'abc' in every supported UTF flavour, for an unknown encoding and for GB2312.
fn test_vstring_to_encoding() {
	no_bytes := iconv.vstring_to_encoding('', 'UTF-8')!
	assert no_bytes == []u8{}

	// encoding name -> expected bytes for 'abc'
	expected_abc := {
		'UTF-8':    [u8(97), 98, 99]
		'UTF-16LE': [u8(97), 0, 98, 0, 99, 0]
		'UTF-16BE': [u8(0), 97, 0, 98, 0, 99]
		'UTF-32LE': [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
		'UTF-32BE': [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
	}
	for encoding, want in expected_abc {
		got := iconv.vstring_to_encoding('abc', encoding)!
		assert got == want
	}

	// an unknown encoding must produce an error, never a value
	if unexpected := iconv.vstring_to_encoding('abc', 'encoding_not_exist') {
		assert false, 'encoding_not_exist'
	}

	if gb_bytes := iconv.vstring_to_encoding('V大法好abc', 'GB2312') {
		assert gb_bytes == [u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99]
	} else {
		// GB2312 is not available on every platform; treat that as a pass
		assert true
	}
}

// test_encoding_to_vstring checks that encoding_to_vstring decodes empty input, 'abc' in
// every supported UTF flavour and GB2312 text, and that an unknown encoding is rejected.
fn test_encoding_to_vstring() {
	no_text := iconv.encoding_to_vstring([], 'UTF-8')!
	assert no_text == ''

	// encoding name -> bytes that must decode to 'abc'
	abc_inputs := {
		'UTF-8':    [u8(97), 98, 99]
		'UTF-16LE': [u8(97), 0, 98, 0, 99, 0]
		'UTF-16BE': [u8(0), 97, 0, 98, 0, 99]
		'UTF-32LE': [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
		'UTF-32BE': [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
	}
	for encoding, raw in abc_inputs {
		decoded := iconv.encoding_to_vstring(raw, encoding)!
		assert decoded == 'abc'
	}

	// an unknown encoding must produce an error, never a value
	if unexpected := iconv.encoding_to_vstring([u8(97), 98, 99], 'encoding_not_exist') {
		assert false, 'encoding_not_exist'
	}

	gb_input := [u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99]
	if gb_text := iconv.encoding_to_vstring(gb_input, 'GB2312') {
		assert gb_text == 'V大法好abc'
	} else {
		// GB2312 is not available on every platform; treat that as a pass
		assert true
	}
}

// test_create_utf_string_with_bom checks the BOM prepended for each supported UTF flavour.
fn test_create_utf_string_with_bom() {
	// formatting is switched off so every expected byte list stays on a single line
	// vfmt off
	abc_utf8 := [u8(97), 98, 99]
	abc_utf16le := [u8(97), 0, 98, 0, 99, 0]
	abc_utf16be := [u8(0), 97, 0, 98, 0, 99]
	abc_utf32le := [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
	abc_utf32be := [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]

	assert iconv.create_utf_string_with_bom(abc_utf8, 'UTF-8') == [u8(0xEF), 0xBB, 0xBF, 97, 98, 99]
	assert iconv.create_utf_string_with_bom(abc_utf16le, 'UTF-16LE') == [u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0]
	assert iconv.create_utf_string_with_bom(abc_utf16be, 'UTF-16BE') == [u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99]
	assert iconv.create_utf_string_with_bom(abc_utf32le, 'UTF-32LE') == [u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
	assert iconv.create_utf_string_with_bom(abc_utf32be, 'UTF-32BE') == [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
	// vfmt on
}

// test_remove_utf_string_with_bom checks that the BOM of each supported UTF flavour is
// stripped and the payload is left untouched.
fn test_remove_utf_string_with_bom() {
	// formatting is switched off so every input byte list stays on a single line
	// vfmt off
	abc_utf8 := [u8(97), 98, 99]
	abc_utf16le := [u8(97), 0, 98, 0, 99, 0]
	abc_utf16be := [u8(0), 97, 0, 98, 0, 99]
	abc_utf32le := [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
	abc_utf32be := [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]

	assert iconv.remove_utf_string_with_bom([u8(0xEF), 0xBB, 0xBF, 97, 98, 99], 'UTF-8') == abc_utf8
	assert iconv.remove_utf_string_with_bom([u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0], 'UTF-16LE') == abc_utf16le
	assert iconv.remove_utf_string_with_bom([u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99], 'UTF-16BE') == abc_utf16be
	assert iconv.remove_utf_string_with_bom([u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0], 'UTF-32LE') == abc_utf32le
	assert iconv.remove_utf_string_with_bom([u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99], 'UTF-32BE') == abc_utf32be
	// vfmt on
}

// my_test_read_file_encoding_write_file_encoding round-trips `txt` through a scratch file:
// it writes `txt` as `encoding` (with a BOM when `bom` is true), checks that the raw file
// content equals `bytes`, then checks that decoding the raw bytes and calling
// read_file_encoding both give back `txt`.
// The scratch file lives in the system temp directory instead of the working directory.
fn my_test_read_file_encoding_write_file_encoding(txt string, encoding string, bom bool, bytes []u8) ! {
	tmp_path := os.join_path(os.temp_dir(), 'iconv_tmp_${os.getpid()}.txt')
	// runs on every return path, including error returns from the steps below;
	// NOTE(review): whether it also runs after a failed assert depends on the test runner
	defer {
		os.rm(tmp_path) or {}
	}
	iconv.write_file_encoding(tmp_path, txt, encoding, bom)!
	// read bytes directly from file
	mut bytes_ref := os.read_file_array[u8](tmp_path)
	assert bytes_ref == bytes
	if bom {
		bytes_ref = iconv.remove_utf_string_with_bom(bytes_ref, encoding)
	}
	str_ref := iconv.encoding_to_vstring(bytes_ref, encoding)!
	assert str_ref.bytes() == txt.bytes()
	str_conv := iconv.read_file_encoding(tmp_path, encoding)!
	assert str_conv == txt
}

// test_read_file_encoding_write_file_encoding runs the write/read round trip for every
// supported UTF flavour, once without and once with a BOM.
fn test_read_file_encoding_write_file_encoding() ! {
	sample := 'V大法好abc'
	// vfmt off
	// UTF-8: plain, then with BOM
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-8', false, [u8(86), 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-8', true, [u8(0xEF), 0xBB, 0xBF, 86, 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!

	// UTF-16LE: plain, then with BOM
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-16LE', false, [u8(86), 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-16LE', true, [u8(0xFF), 0xFE, 86, 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!

	// UTF-16BE: plain, then with BOM
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-16BE', false, [u8(0), 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-16BE', true, [u8(0xFE), 0xFF, 0, 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!

	// UTF-32LE: plain, then with BOM
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-32LE', false, [u8(86), 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-32LE', true, [u8(0xFF), 0xFE, 0, 0, 86, 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!

	// UTF-32BE: plain, then with BOM
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-32BE', false, [u8(0), 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
	my_test_read_file_encoding_write_file_encoding(sample, 'UTF-32BE', true, [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
	// vfmt on
}
Loading
Loading