From 05be6054a24a125f26adb3477efcdb8690b0fe41 Mon Sep 17 00:00:00 2001 From: chodak166 Date: Thu, 18 Dec 2025 18:31:12 +0100 Subject: [PATCH] WIP: lv_map tests --- example_dict.json | 12 -- lib/Cargo.toml | 7 +- lib/src/core/errors.rs | 8 + lib/src/core/sys_major/lvmap.rs | 279 +++++++++++++++++--------------- lib/src/core/traits.rs | 2 +- 5 files changed, 162 insertions(+), 146 deletions(-) delete mode 100644 example_dict.json diff --git a/example_dict.json b/example_dict.json deleted file mode 100644 index 81beb7a..0000000 --- a/example_dict.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - {"word": "hello", "metadata": {"type": "greeting", "language": "english"}}, - {"word": "world", "metadata": {"type": "noun", "language": "english"}}, - {"word": "rust", "metadata": {"type": "programming_language", "paradigm": "systems"}}, - {"word": "programming", "metadata": {"type": "verb", "context": "computing"}}, - {"word": "database", "metadata": {"type": "noun", "context": "data_storage"}}, - {"word": "sqlite", "metadata": {"type": "database_engine", "features": ["embedded", "sql"]}}, - {"word": "json", "metadata": {"type": "data_format", "standard": "RFC 8259"}}, - {"word": "import", "metadata": {"type": "verb", "context": "data_operations"}}, - {"word": "dictionary", "metadata": {"type": "noun", "context": "reference"}}, - {"word": "example", "metadata": {"type": "noun", "usage": "demonstration"}} -] \ No newline at end of file diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 3261a7e..66fc9e7 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -14,7 +14,10 @@ anyhow = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" chrono = { version = "0.4", features = ["serde"] } -thiserror = "1.0" +thiserror = "2.0" async-trait = "0.1" parking_lot = "0.12" -sqlx = { version = "0.7", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] } +sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] } + +[dev-dependencies] +mockall = "0.14.0" diff --git a/lib/src/core/errors.rs b/lib/src/core/errors.rs index 1c9dbe5..a8da207 100644 --- a/lib/src/core/errors.rs +++ b/lib/src/core/errors.rs @@ -17,3 +17,11 @@ pub enum EncoderError { #[error("Unexpected error: {0}")] UnexpectedError(String), } + +#[derive(Error, Debug)] +pub enum DecoderError { + #[error("Decoder input error")] + InputError, + #[error("Unexpected error: {0}")] + UnexpectedError(String), +} diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/core/sys_major/lvmap.rs index e418025..03bc938 100644 --- a/lib/src/core/sys_major/lvmap.rs +++ b/lib/src/core/sys_major/lvmap.rs @@ -1,6 +1,10 @@ use std::collections::HashMap; -use crate::core::{DictRepository, SystemEncoder}; +use anyhow::Error; + +use crate::core::errors::RepositoryError; +use crate::core::traits::DecodedValue; +use crate::core::{DictRepository, SystemDecoder}; // We store words by encoded number length, then encoded value // Example: @@ -15,6 +19,7 @@ use crate::core::{DictRepository, SystemEncoder}; // - 45: // - word: oral +// Words are fetched from DictRepository in batches const DEFAULT_DICT_BATCH_SIZE: usize = 100; type ValueLength = u8; @@ -23,35 +28,66 @@ pub type LenValueData = HashMap>>; #[derive(Debug)] pub struct LenValueMap { - map: LenValueData, - batch_size: usize, + data: LenValueData, } impl LenValueMap { pub fn new() -> Self { LenValueMap { - map: LenValueData::new(), - batch_size: DEFAULT_DICT_BATCH_SIZE, + data: LenValueData::new(), } } + pub fn with_data(mut self, data: LenValueData) -> Self { - self.map = data; + self.data = data; self } - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = batch_size; - self + pub fn data(&self) -> &LenValueData { + &self.data + } + + pub fn into_data(self) -> LenValueData { + self.data } - pub fn build(encoder: &E, repo: &R) -> Self { - let mut map = LenValueData::new(); + pub fn insert_words( + &mut self, + words: Vec, + decoder: &impl SystemDecoder, + ) -> Result<(), Error> { + for word in words { + if word.is_empty() { + continue; + } + let decoded = decoder.decode(&word); + let int_value = decoded.parse::()?; + let len = decoded.len() as u8; + + if let Some(len_item) = self.data.get_mut(&len) { + if let Some(value_item) = len_item.get_mut(&int_value) { + value_item.push(word); + } else { + len_item.insert(int_value, vec![word]); + } + } else { + self.data + .insert(len, HashMap::from([(int_value, vec![word])])); + } + } + Ok(()) + } - LenValueMap::new().with_data(map) + pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { + Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach? } pub fn is_empty(&self) -> bool { - self.map.is_empty() + self.data.is_empty() + } + + fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self { + todo!() } } @@ -61,137 +97,118 @@ mod tests { use crate::core::{entities::*, errors::*}; use std::collections::HashMap; - #[test] - fn test_no_inpout_then_empty_map() { - let repo = MockRepository::new().with_single_word_dict(); - let encoder = MockEncoder::new(EncodingResult { - input: "".to_string(), - output: vec![], - }); - let lv_map = LenValueMap::build(&encoder, &repo); - assert!(lv_map.is_empty()); + use mockall::automock; + use mockall::{mock, predicate::*}; + + const TEST_WORD_1: &str = "test_word_1"; + const TEST_WORD_2: &str = "test_word_2"; + const TEST_WORD_3: &str = "test_word_3"; + const TEST_WORD_4: &str = "test_word_4"; + const TEST_NUM_1: u64 = 12; + const TEST_NUM_2: u64 = 34; + const TEST_NUM_3: u64 = 9876; + const TEST_NUM_1_LEN: u8 = 2; + const TEST_NUM_3_LEN: u8 = 4; + + fn get_test_dec_map() -> HashMap { + HashMap::from([ + (TEST_WORD_1.to_string(), TEST_NUM_1.to_string()), + (TEST_WORD_2.to_string(), TEST_NUM_2.to_string()), + (TEST_WORD_3.to_string(), TEST_NUM_3.to_string()), + (TEST_WORD_4.to_string(), TEST_NUM_3.to_string()), + ]) } - // #[test] - // fn test_empty_dict_then_no_output() { - // let dict = Dict { - // name: "test".to_string(), - // entries: HashMap::new(), - // }; - // let repo = MockRepository::new(dict); - // let encoder = Encoder::new(&repo); - // let result = encoder.encode("test").unwrap(); - // assert!(result.output.is_empty()); - // } - - // #[test] - // fn test_encode_single_digit() { - // let dict = get_single_word_dict(); - // let repo = MockRepository::new(dict); - // let encoder = Encoder::new(&repo).with_batch_size(1); - // let result = encoder.encode("test").unwrap(); - // assert!(result.output.is_empty()); - // } - - // ---------- Helpers ---------- - - fn get_single_word_dict() -> Dict { - Dict { - name: "test_dict".to_string(), - entries: HashMap::from([( - 1, - DictEntry { - id: Some(1), - text: "test_word_1".to_string(), - metadata: HashMap::new(), - }, - )]), - } + fn mock_decoding(word: &str) -> DecodedValue { + get_test_dec_map().remove(word).unwrap() } - // ---------- Mocks ---------- - - struct MockEncoder { - result: EncodingResult, + fn get_test_words() -> Vec { + vec![ + TEST_WORD_1.to_string(), + TEST_WORD_2.to_string(), + TEST_WORD_3.to_string(), + TEST_WORD_4.to_string(), + ] } - impl MockEncoder { - fn new(result: EncodingResult) -> Self { - MockEncoder { result } + mock! { + pub Decoder {} + impl SystemDecoder for Decoder { + fn decode(&self, word: &str) -> DecodedValue; } } - impl SystemEncoder for MockEncoder { - fn initialize(&self) -> Result<(), EncoderError> { - Ok(()) - } - - fn encode(&self, _word: &str) -> Result { - Ok(self.result.clone()) - } - } + #[test] + fn test_insert_words_empty() { + let words = vec![]; + let mut decoder = MockDecoder::new(); + decoder.expect_decode().returning(|_| DecodedValue::new()); - struct MockRepository { - dict: Dict, + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + assert!(lv_map.is_empty()); } - impl MockRepository { - pub fn new() -> Self { - MockRepository { - dict: Dict::new("test_dict".to_string()), - } - } - - pub fn with_single_word_dict(mut self) -> Self { - self.dict = get_single_word_dict(); - self - } + #[test] + fn test_single_word() { + let words = vec![TEST_WORD_1.to_string()]; + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + + let data = lv_map.into_data(); + + assert_eq!(data.len(), 1); + assert!(data.contains_key(&TEST_NUM_1_LEN)); + let data = data.get(&TEST_NUM_1_LEN).unwrap(); + assert!(data.contains_key(&TEST_NUM_1)); + let words = data.get(&TEST_NUM_1).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_1); } - #[async_trait::async_trait] - impl DictRepository for MockRepository { - async fn create_dict(&self, _name: &str) -> Result<(), RepositoryError> { - Ok(()) - } - - async fn save_entries( - &self, - _dict_name: &str, - _entries: &[DictEntry], - ) -> Result<(), RepositoryError> { - todo!() - } - - async fn fetch_many( - &self, - _name: &str, - limit: Option, - offset: Option, - ) -> Result { - let offset = offset.unwrap_or(0) as usize; - let limit = limit.unwrap_or(u32::MAX) as usize; - - let mut entries_vec: Vec<_> = self.dict.entries.iter().collect(); - entries_vec.sort_by_key(|&(id, _)| *id); - - let paginated = entries_vec.into_iter().skip(offset).take(limit); - let paginated_map: HashMap = paginated - .map(|(id, entry)| { - ( - *id, - DictEntry { - id: entry.id, - text: entry.text.clone(), - metadata: entry.metadata.clone(), - }, - ) - }) - .collect(); - - Ok(Dict { - name: self.dict.name.clone(), - entries: paginated_map, - }) - } + #[test] + fn test_multiple_words() { + let words = get_test_words(); + + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + + let data = lv_map.into_data(); + + assert_eq!(data.len(), 2); // two different lengths + assert!(data.contains_key(&TEST_NUM_1_LEN)); + assert!(data.contains_key(&TEST_NUM_3_LEN)); + let l2 = data.get(&TEST_NUM_1_LEN).unwrap(); + let l4 = data.get(&TEST_NUM_3_LEN).unwrap(); + + assert_eq!(l2.len(), 2); // two numbers + assert_eq!(l4.len(), 1); // one number + assert!(l2.contains_key(&TEST_NUM_1)); + assert!(l2.contains_key(&TEST_NUM_2)); + assert!(l4.contains_key(&TEST_NUM_3)); + + let words = l2.get(&TEST_NUM_1).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_1); + + let words = l2.get(&TEST_NUM_2).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_2); + + let words = l4.get(&TEST_NUM_3).unwrap(); + assert_eq!(words.len(), 2); + assert_eq!(words[0], TEST_WORD_3); + assert_eq!(words[1], TEST_WORD_4); } } diff --git a/lib/src/core/traits.rs b/lib/src/core/traits.rs index f06cbfa..a06d663 100644 --- a/lib/src/core/traits.rs +++ b/lib/src/core/traits.rs @@ -8,7 +8,7 @@ use super::errors::RepositoryError; /// but decoded as one number. For partial values, we can use /// u64, but for the whole decoded value that may be very long, /// we need a string. -type DecodedValue = String; +pub type DecodedValue = String; pub trait SystemDecoder { fn decode(&self, word: &str) -> DecodedValue;