vlang · spytheman · Oct 2, 2024 · Sep 27, 2024 · Sep 27, 2024 · Sep 27, 2024
diff --git a/.github/workflows/macos_ci.yml b/.github/workflows/macos_ci.yml
@@ -63,6 +63,8 @@ jobs:
           echo $VFLAGS
           v cmd/tools/test_if_v_test_system_works.v
           ./cmd/tools/test_if_v_test_system_works
+      - name: Install iconv for encoding.iconv
+        run: brew install libiconv
       - name: Test pure V math module
         run: v -exclude @vlib/math/*.c.v test vlib/math/
       - name: Self tests

diff --git a/cmd/tools/modules/testing/common.v b/cmd/tools/modules/testing/common.v
@@ -280,6 +280,7 @@ pub fn new_test_session(_vargs string, will_compile bool) TestSession {
 			skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'
 			skip_files << 'vlib/crypto/ecdsa/ecdsa_test.v'
 			skip_files << 'vlib/x/ttf/ttf_test.v'
+			skip_files << 'vlib/encoding/iconv/iconv_test.v' // needs libiconv to be installed
 		}
 		if github_job == 'tests-sanitize-memory-clang' {
 			skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'

diff --git a/vlib/encoding/iconv/iconv.v b/vlib/encoding/iconv/iconv.v
@@ -0,0 +1,144 @@
+module iconv
+
+// Module iconv provides functions to convert between vstring(UTF8) and other encodings.
+import os
+
+// reverse_u16 returns `src` with its two bytes exchanged.
+@[inline]
+fn reverse_u16(src u16) u16 {
+	upper_moved_down := src >> 8
+	lower_moved_up := src << 8
+	return u16(upper_moved_down | lower_moved_up)
+}
+
+// reverse_u32 returns `src` with its four bytes in the opposite order.
+@[inline]
+fn reverse_u32(src u32) u32 {
+	byte0_to_top := src << 24
+	byte1_up := (src << 8) & 0x00FF_0000
+	byte2_down := (src >> 8) & 0x0000_FF00
+	byte3_to_bottom := src >> 24
+	return u32(byte3_to_bottom | byte2_down | byte1_up | byte0_to_top)
+}
+
+// vstring_to_encoding converts the V string `str` (UTF-8) to bytes in the `tocode` encoding.
+// The names UTF16/UTF-16/UTF32/UTF-32 (no explicit byte order) are rejected with an error;
+// any error returned by `conv` is passed on to the caller.
+// tips: use `iconv --list` to check for supported encodings
+pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
+	encoding_name := tocode.to_upper()
+	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
+		// same wording as write_file_encoding/read_file_encoding
+		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
+	}
+	return conv(tocode, 'UTF-8', str.str, str.len)
+}
+
+// encoding_to_vstring converts `bytes`, interpreted in the `fromcode` encoding, to a V string (UTF-8).
+// The names UTF16/UTF-16/UTF32/UTF-32 (no explicit byte order) are rejected with an error;
+// any error returned by `conv` is passed on to the caller.
+// tips: use `iconv --list` to check for supported encodings
+pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
+	encoding_name := fromcode.to_upper()
+	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
+		// same wording as write_file_encoding/read_file_encoding
+		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
+	}
+	mut dst := conv('UTF-8', fromcode, bytes.data, bytes.len)!
+	dst << 0 // add a tail zero, to build a vstring
+	// NOTE(review): the string is built from a C string, so it presumably ends at the first
+	// zero byte in `dst` - confirm whether input containing U+0000 needs to be supported.
+	return unsafe { cstring_to_vstring(dst.data) }
+}
+
+// create_utf_string_with_bom returns a copy of `src` with the byte order mark (BOM) of
+// `utf_type` placed in front of it. `utf_type` is compared in upper case:
+// UTF8/UTF-8       -> EF BB BF
+// UTF16LE/UTF-16LE -> FF FE
+// UTF16BE/UTF-16BE -> FE FF
+// UTF32LE/UTF-32LE -> FF FE 00 00
+// UTF32BE/UTF-32BE -> 00 00 FE FF
+// For any other `utf_type` the result is an unchanged copy of `src`.
+pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
+	bom := match utf_type.to_upper() {
+		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
+		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
+		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
+		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
+		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
+		else { []u8{} }
+	}
+	mut with_bom := []u8{cap: bom.len + src.len}
+	with_bom << bom
+	with_bom << src
+	return with_bom
+}
+
+// remove_utf_string_with_bom returns a copy of `src` without the leading byte order mark
+// (BOM) of `utf_type`. `utf_type` is compared in upper case:
+// UTF8/UTF-8       -> EF BB BF
+// UTF16LE/UTF-16LE -> FF FE
+// UTF16BE/UTF-16BE -> FE FF
+// UTF32LE/UTF-32LE -> FF FE 00 00
+// UTF32BE/UTF-32BE -> 00 00 FE FF
+// When `src` does not start with that BOM, or `utf_type` is none of the names above,
+// the result is an unchanged copy of `src`. An input that consists only of the BOM
+// yields an empty array.
+@[direct_array_access]
+pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
+	bom := match utf_type.to_upper() {
+		'UTF8', 'UTF-8' { [u8(0xEF), 0xBB, 0xBF] }
+		'UTF16LE', 'UTF-16LE' { [u8(0xFF), 0xFE] }
+		'UTF16BE', 'UTF-16BE' { [u8(0xFE), 0xFF] }
+		'UTF32LE', 'UTF-32LE' { [u8(0xFF), 0xFE, 0, 0] }
+		'UTF32BE', 'UTF-32BE' { [u8(0), 0, 0xFE, 0xFF] }
+		else { []u8{} }
+	}
+	// The length guard has to come before any indexing, because direct_array_access
+	// disables the bounds checks on `src[i]` below.
+	if bom.len == 0 || src.len < bom.len {
+		return src.clone()
+	}
+	for i in 0 .. bom.len {
+		if src[i] != bom[i] {
+			return src.clone()
+		}
+	}
+	return src[bom.len..].clone()
+}
+
+// write_file_encoding converts `text` to `encoding` and writes the result to the file at `path`.
+// If `path` already exists, it will be overwritten.
+// When `bom` is true and the upper-cased `encoding` starts with `UTF`, the converted bytes are
+// passed through create_utf_string_with_bom before they are written.
+// The names UTF16/UTF-16/UTF32/UTF-32 are rejected with an error.
+pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
+	encoding_name := encoding.to_upper()
+	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
+		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
+	}
+	mut file_bytes := vstring_to_encoding(text, encoding)!
+	if bom && encoding_name.starts_with('UTF') {
+		file_bytes = create_utf_string_with_bom(file_bytes, encoding)
+	}
+	os.write_file_array(path, file_bytes)!
+}
+
+// read_file_encoding reads the file at `path`, strips a leading BOM matching `encoding`
+// (see remove_utf_string_with_bom) and converts the remaining bytes from `encoding`
+// to a V string (UTF-8).
+// It returns an error for the names UTF16/UTF-16/UTF32/UTF-32, when the file cannot be
+// read, or when the conversion fails.
+pub fn read_file_encoding(path string, encoding string) !string {
+	encoding_name := encoding.to_upper()
+	if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
+		return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
+	}
+	// os.read_bytes reports a read failure as an error. os.read_file_array[u8] has no error
+	// return, so an unreadable file would be indistinguishable from an empty one.
+	encoding_bytes := os.read_bytes(path)!
+	encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
+	return encoding_to_vstring(encoding_without_bom_bytes, encoding)!
+}
diff --git a/vlib/encoding/iconv/iconv_nix.c.v b/vlib/encoding/iconv/iconv_nix.c.v
@@ -0,0 +1,58 @@
+module iconv
+
+// Module iconv provides functions to convert between vstring (UTF-8) and other encodings.
+
+#include <iconv.h>
+#flag darwin -liconv
+
+// C.iconv_open is declared in <iconv.h>. `conv` passes it the target and source encoding
+// names, keeps the result as its conversion descriptor, and treats a result of -1 as failure.
+fn C.iconv_open(tocode &u8, fromcode &u8) voidptr
+// C.iconv_close is declared in <iconv.h>. `conv` calls it (deferred) on the descriptor
+// it got from C.iconv_open.
+fn C.iconv_close(cd voidptr) int
+// C.iconv is declared in <iconv.h>. `conv` calls it once with pointers to the input/output
+// cursors and their remaining byte counts, reads the unused output space back from
+// `outbytesleft`, and treats a return value of usize(-1) as failure.
+fn C.iconv(cd voidptr, inbuf &&u8, inbytesleft &usize, outbuf &&u8, outbytesleft &usize) usize
+
+// dashed_utf_name maps the names UTF16LE/UTF16BE/UTF32LE/UTF32BE to
+// UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE and returns every other name unchanged.
+// As macos-12 has no UTF16LE/UTF16BE/UTF32LE/UTF32BE, the dashed spellings are used instead.
+fn dashed_utf_name(name string) string {
+	return match name {
+		'UTF16LE' { 'UTF-16LE' }
+		'UTF16BE' { 'UTF-16BE' }
+		'UTF32LE' { 'UTF-32LE' }
+		'UTF32BE' { 'UTF-32BE' }
+		else { name }
+	}
+}
+
+// conv converts the `src_len` bytes at `src` from the `fromcode` encoding to the `tocode`
+// encoding and returns the converted bytes.
+// Both encoding names are upper-cased before they are handed to iconv_open.
+// It returns an error when `src_len` is negative, when iconv_open returns -1 for the
+// encoding pair, or when the iconv call returns usize(-1).
+@[direct_array_access]
+fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
+	if src_len < 0 {
+		return error('src length error')
+	}
+
+	src_encoding := dashed_utf_name(fromcode.to_upper())
+	dst_encoding := dashed_utf_name(tocode.to_upper())
+
+	mut cd := C.iconv_open(dst_encoding.str, src_encoding.str)
+	if isize(cd) == -1 {
+		return error('platform can\'t convert from ${src_encoding} to ${dst_encoding}')
+	}
+	defer { C.iconv_close(cd) }
+
+	// output buffer of (src_len + 1) * 4 bytes; this should be enough to hold the converted string
+	mut out := []u8{len: (src_len + 1) * 4}
+
+	// iconv advances the cursors and decrements the byte counters through these pointers
+	mut in_cursor := &u8(src)
+	mut out_cursor := &u8(out.data)
+	mut in_left := usize(src_len)
+	mut out_left := usize(out.len)
+	res := C.iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left)
+	if res == usize(-1) {
+		return error('convert encoding string fail, iconv return ${res}')
+	}
+
+	// out_left is the space iconv left unused, so only the written prefix is kept
+	out.trim(out.len - int(out_left))
+	return out
+}
diff --git a/vlib/encoding/iconv/iconv_test.v b/vlib/encoding/iconv/iconv_test.v
@@ -0,0 +1,129 @@
+import encoding.iconv
+import os
+
+// test_vstring_to_encoding checks the exact bytes produced for '' and 'abc' in the UTF
+// encodings, that an unknown encoding name does not succeed, and a GB2312 conversion
+// on platforms that support it.
+fn test_vstring_to_encoding() {
+	assert iconv.vstring_to_encoding('', 'UTF-8')! == []u8{}
+
+	expected_abc := {
+		'UTF-8':    [u8(97), 98, 99]
+		'UTF-16LE': [u8(97), 0, 98, 0, 99, 0]
+		'UTF-16BE': [u8(0), 97, 0, 98, 0, 99]
+		'UTF-32LE': [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+		'UTF-32BE': [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	}
+	for encoding, expected in expected_abc {
+		assert iconv.vstring_to_encoding('abc', encoding)! == expected
+	}
+
+	if unexpected := iconv.vstring_to_encoding('abc', 'encoding_not_exist') {
+		assert false, 'encoding_not_exist'
+	}
+
+	ch_bytes := iconv.vstring_to_encoding('V大法好abc', 'GB2312') or {
+		// some platforms do not support GB2312, skip
+		return
+	}
+	assert ch_bytes == [u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99]
+}
+
+// test_encoding_to_vstring checks that the UTF encodings of '' and 'abc' decode back to
+// those strings, that an unknown encoding name does not succeed, and a GB2312
+// conversion on platforms that support it.
+fn test_encoding_to_vstring() {
+	assert iconv.encoding_to_vstring([]u8{}, 'UTF-8')! == ''
+
+	encoded_abc := {
+		'UTF-8':    [u8(97), 98, 99]
+		'UTF-16LE': [u8(97), 0, 98, 0, 99, 0]
+		'UTF-16BE': [u8(0), 97, 0, 98, 0, 99]
+		'UTF-32LE': [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+		'UTF-32BE': [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	}
+	for encoding, encoded in encoded_abc {
+		assert iconv.encoding_to_vstring(encoded, encoding)! == 'abc'
+	}
+
+	if unexpected := iconv.encoding_to_vstring([u8(97), 98, 99], 'encoding_not_exist') {
+		assert false, 'encoding_not_exist'
+	}
+
+	gb2312_bytes := [u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99]
+	ch_str := iconv.encoding_to_vstring(gb2312_bytes, 'GB2312') or {
+		// some platforms do not support GB2312, skip
+		return
+	}
+	assert ch_str == 'V大法好abc'
+}
+
+// test_create_utf_string_with_bom checks, for each supported UTF name, that the matching
+// BOM bytes are placed in front of the encoded form of 'abc'.
+fn test_create_utf_string_with_bom() {
+	// the long array literals are kept on one line each
+	// vfmt off
+	abc_utf8 := [u8(97), 98, 99]
+	assert iconv.create_utf_string_with_bom(abc_utf8, 'UTF-8') == [u8(0xEF), 0xBB, 0xBF, 97, 98, 99]
+
+	abc_utf16le := [u8(97), 0, 98, 0, 99, 0]
+	assert iconv.create_utf_string_with_bom(abc_utf16le, 'UTF-16LE') == [u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0]
+
+	abc_utf16be := [u8(0), 97, 0, 98, 0, 99]
+	assert iconv.create_utf_string_with_bom(abc_utf16be, 'UTF-16BE') == [u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99]
+
+	abc_utf32le := [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+	assert iconv.create_utf_string_with_bom(abc_utf32le, 'UTF-32LE') == [u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+
+	abc_utf32be := [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	assert iconv.create_utf_string_with_bom(abc_utf32be, 'UTF-32BE') == [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	// vfmt on
+}
+
+// test_remove_utf_string_with_bom checks, for each supported UTF name, that the matching
+// BOM bytes are stripped from the front of the encoded form of 'abc'.
+fn test_remove_utf_string_with_bom() {
+	// the long array literals are kept on one line each
+	// vfmt off
+	bom_abc_utf8 := [u8(0xEF), 0xBB, 0xBF, 97, 98, 99]
+	assert iconv.remove_utf_string_with_bom(bom_abc_utf8, 'UTF-8') == [u8(97), 98, 99]
+
+	bom_abc_utf16le := [u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0]
+	assert iconv.remove_utf_string_with_bom(bom_abc_utf16le, 'UTF-16LE') == [u8(97), 0, 98, 0, 99, 0]
+
+	bom_abc_utf16be := [u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99]
+	assert iconv.remove_utf_string_with_bom(bom_abc_utf16be, 'UTF-16BE') == [u8(0), 97, 0, 98, 0, 99]
+
+	bom_abc_utf32le := [u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+	assert iconv.remove_utf_string_with_bom(bom_abc_utf32le, 'UTF-32LE') == [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
+
+	bom_abc_utf32be := [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	assert iconv.remove_utf_string_with_bom(bom_abc_utf32be, 'UTF-32BE') == [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
+	// vfmt on
+}
+
+// my_test_read_file_encoding_write_file_encoding writes `txt` to a temporary file in
+// `encoding` (passing `bom` on to iconv.write_file_encoding) and checks that:
+// - the raw file content equals `bytes`,
+// - decoding that content (after BOM removal when `bom` is true) gives back `txt`,
+// - iconv.read_file_encoding returns `txt` for the same file.
+// The file is created under os.vtmp_dir() instead of the current working directory,
+// so the test does not depend on where it is started from, and is removed at the end.
+fn my_test_read_file_encoding_write_file_encoding(txt string, encoding string, bom bool, bytes []u8) ! {
+	tmp_path := os.join_path(os.vtmp_dir(), 'iconv_tmp.txt')
+	iconv.write_file_encoding(tmp_path, txt, encoding, bom)!
+	// read bytes directly from file
+	mut bytes_ref := os.read_file_array[u8](tmp_path)
+	assert bytes_ref == bytes
+	if bom {
+		bytes_ref = iconv.remove_utf_string_with_bom(bytes_ref, encoding)
+	}
+	str_ref := iconv.encoding_to_vstring(bytes_ref, encoding)!
+	assert str_ref.bytes() == txt.bytes()
+	str_conv := iconv.read_file_encoding(tmp_path, encoding)!
+	assert str_conv == txt
+	os.rm(tmp_path)!
+}
+
+// test_read_file_encoding_write_file_encoding runs the write/read round trip for one mixed
+// ASCII/CJK string in every supported UTF encoding, once without and once with a BOM.
+fn test_read_file_encoding_write_file_encoding() ! {
+	txt := 'V大法好abc'
+	// the long array literals are kept on one line each
+	// vfmt off
+	// UTF-8
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-8', false, [u8(86), 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-8', true, [u8(0xEF), 0xBB, 0xBF, 86, 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!
+
+	// UTF-16LE
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-16LE', false, [u8(86), 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-16LE', true, [u8(0xFF), 0xFE, 86, 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!
+
+	// UTF-16BE
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-16BE', false, [u8(0), 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-16BE', true, [u8(0xFE), 0xFF, 0, 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!
+
+	// UTF-32LE
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-32LE', false, [u8(86), 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-32LE', true, [u8(0xFF), 0xFE, 0, 0, 86, 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!
+
+	// UTF-32BE
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-32BE', false, [u8(0), 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
+	my_test_read_file_encoding_write_file_encoding(txt, 'UTF-32BE', true, [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
+	// vfmt on
+}