Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize search #4

Merged
merged 2 commits into from
Sep 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "region-cn"
version = "0.1.10"
version = "0.1.11"
edition = "2021"
authors = ["bujnlc8 <75124771@qq.com>"]
description = "A lib to search chinese region by 6-digit administrative code with trier or search in place."
Expand Down
4 changes: 2 additions & 2 deletions data/region.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def pack(self, data_list: list[tuple[str, str, str]] | list[tuple[str, str]], ve
f.write(version.to_bytes(length=4))
# 先跳过偏移
f.seek(6)
# 写数据, code [i:3] type [i: 1] region [c: n] \n
# 写数据
index_offset = 6
offset_map: dict[int, int] = {}
chars = set()
Expand Down Expand Up @@ -239,14 +239,14 @@ def search(self, region_code: str) -> tuple[str, list[str]]:
if len(region_code) != 6:
raise ValueError('地区编码必须为6位')
with open(self.file_name, 'rb') as f:
# version = int.from_bytes(f.read(4), byteorder="big")
# 跳过版本号
f.seek(4)
index_offset = int.from_bytes(f.read(2), byteorder='big')
# 查找 索引区
code_2 = int(region_code[:2])
f.seek(index_offset)
offset = 0
# 由于省份是固定的,可对34个省份的偏移生成一个映射表
for _ in range(0, 34):
combine_bytes = f.read(3)
if not combine_bytes:
Expand Down
16 changes: 16 additions & 0 deletions examples/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,20 @@ pub fn main() {
assert_eq!(result.discard_year, discard_year.parse::<u32>().unwrap());
}
}
// 测试最新数据
let mut origin_file = File::open(PathBuf::from("data/region.txt")).unwrap();
let mut file_string = String::new();
origin_file.read_to_string(&mut file_string).unwrap();
let mut searcher = Region::new(PathBuf::from("data/region.dat"));
let json_data: Value = serde_json::from_str(&file_string).unwrap();
for x in json_data.as_array().unwrap() {
let code = x.get(0).unwrap().as_str().unwrap();
let region_name = x.get(1).unwrap().as_str().unwrap().replace("*", "");
let result = searcher.search_with_trie(code).unwrap();
println!("{code} {region_name} => {}", result.name);
assert!(result.name.ends_with(&region_name));
let result = searcher.search_with_data(code).unwrap();
println!("{code} {region_name} => {}", result.name);
assert!(result.name.ends_with(&region_name));
}
}
53 changes: 30 additions & 23 deletions src/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ use crate::{be_u8_slice_to_i32, decode_u8_list, trie::RegionTrie, RegionError, R
/// Searcher over a packed region data file.
///
/// Keeps the open file handle together with lookup state that the search
/// routines fill in as they read the file (index-region offset, character map).
pub struct Region {
/// Path of the data file, as passed to `new`.
file_path: PathBuf,
/// Initialised to an empty string by `new`.
/// NOTE(review): presumably the data file's version once it has been read — confirm.
version: String,
// NOTE(review): this `i32` declaration is the pre-change line shown by the diff;
// the `u64` declaration directly below replaces it.
offset_index: i32,
/// Absolute file offset of the index region; 0 until it has been read from the
/// 2-byte field in the file header. Stored as `u64` so it can be passed to
/// `SeekFrom::Start` without a cast.
offset_index: u64,
/// `None` right after `new`.
/// NOTE(review): presumably built on the first trie-based search — confirm.
region_trier: Option<Rc<RefCell<RegionTrie>>>,
/// Index-to-character table; filled from the file's character-set section
/// (located at `offset_index + 34 * 3`) when it is still empty.
char_map: Rc<RefCell<HashMap<usize, char>>>,
/// Open handle to the data file, shared through `Rc<RefCell<_>>`.
file: Rc<RefCell<File>>,
/// Two-digit province prefix -> byte offset of that province's 3-byte entry,
/// relative to `offset_index`.
index_offset_map: HashMap<i32, u64>,
}

impl Default for RegionTrie {
Expand All @@ -29,16 +30,27 @@ impl Default for RegionTrie {
}
}

// First two digits (province prefix) of the 34 province-level region codes.
//
// `Region::new` maps each code to `position * 3`, i.e. the byte offset of its
// 3-byte entry inside the data file's index region, so the order of this table
// must match the order in which the index entries are stored in the file.
// NOTE(review): presumably the order written by `data/region.py` — confirm.
const PROVINCE_CODES: [i32; 34] = [
11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45, 46, 50, 51, 52,
53, 54, 61, 62, 63, 64, 65, 71, 81, 82,
];

impl Region {
/// Opens the region data file at `file_path` and wraps it in a searcher.
///
/// For every entry of `PROVINCE_CODES` this also precomputes the byte offset
/// of its 3-byte slot inside the index region (`position * 3`), so a search
/// can seek directly to the slot for a given province prefix.
///
/// # Panics
///
/// Panics if the file cannot be opened.
pub fn new(file_path: PathBuf) -> Self {
    let handle = File::open(&file_path).unwrap();
    // Province prefix -> slot offset, relative to the start of the index region.
    let index_offset_map: HashMap<i32, u64> = PROVINCE_CODES
        .iter()
        .enumerate()
        .map(|(slot, code)| (*code, (slot * 3) as u64))
        .collect();
    Self {
        file_path,
        version: String::new(),
        offset_index: 0,
        region_trier: None,
        char_map: Rc::new(RefCell::new(HashMap::new())),
        file: Rc::new(RefCell::new(handle)),
        index_offset_map,
    }
}

Expand Down Expand Up @@ -70,9 +82,7 @@ impl Region {
if char_map_ref.is_empty() {
// 读取字符集
file_ref
.seek(std::io::SeekFrom::Start(
(self.offset_index + 34 * 3) as u64,
))
.seek(std::io::SeekFrom::Start(self.offset_index + 34 * 3))
.map_err(RegionError::IOError)?;
// gbk编码
let mut char_bytes = Vec::new();
Expand Down Expand Up @@ -101,7 +111,7 @@ impl Region {
let mut index_offset: [u8; 2] = [0; 2];
file.read_exact(&mut index_offset)
.map_err(RegionError::IOError)?;
self.offset_index = be_u8_slice_to_i32(&index_offset);
self.offset_index = be_u8_slice_to_i32(&index_offset) as u64;
} else {
file.seek(std::io::SeekFrom::Start(6))
.map_err(RegionError::IOError)?;
Expand Down Expand Up @@ -198,34 +208,31 @@ impl Region {
let mut index_offset: [u8; 2] = [0; 2];
file.read_exact(&mut index_offset)
.map_err(RegionError::IOError)?;
self.offset_index = be_u8_slice_to_i32(&index_offset);
self.offset_index = be_u8_slice_to_i32(&index_offset) as u64;
}
// 读取字符集
self.get_char_map(&mut file)?;
// 查找索引区
file.seek(std::io::SeekFrom::Start(self.offset_index as u64))
file.seek(std::io::SeekFrom::Start(self.offset_index))
.map_err(RegionError::IOError)?;
let mut region_code_offset: [u8; 3] = [0u8; 3];
let region_code_int: i32 = region_code.parse().map_err(RegionError::ParseError)?;
let mut offset = 0;
for _ in 0..34 {
let amount = file
.read(&mut region_code_offset)
.map_err(RegionError::IOError)?;
if amount == 0 {
break;
// region_code 前2位
let code_2_int = region_code_int / 10000;
match self.index_offset_map.get(&code_2_int) {
Some(v) => {
file.seek(std::io::SeekFrom::Start(self.offset_index + (*v)))
.map_err(RegionError::IOError)?;
}
// 高7位表示code的前2位,后17位表示偏移
let combine = be_u8_slice_to_i32(&region_code_offset);
let code_2 = combine >> 17;
if code_2 == region_code_int / 10000 {
offset = combine - (code_2 << 17);
break;
None => {
return Err(RegionError::Message("cannot find record".to_string()));
}
}
if offset == 0 {
return Err(RegionError::Message("cannot find record".to_string()));
}
let _ = file
.read(&mut region_code_offset)
.map_err(RegionError::IOError)?;
let combine = be_u8_slice_to_i32(&region_code_offset);
let offset = combine - (code_2_int << 17);
file.seek(std::io::SeekFrom::Start(offset as u64))
.map_err(RegionError::IOError)?;
let mut province_record: [u8; 4000] = [0u8; 4000];
Expand Down
Loading