From 05be6054a24a125f26adb3477efcdb8690b0fe41 Mon Sep 17 00:00:00 2001
From: chodak166 <chodak166@op.pl>
Date: Thu, 18 Dec 2025 18:31:12 +0100
Subject: [PATCH] WIP: lv_map tests

---
 example_dict.json               |  12 --
 lib/Cargo.toml                  |   7 +-
 lib/src/core/errors.rs          |   8 +
 lib/src/core/sys_major/lvmap.rs | 279 +++++++++++++++++---------------
 lib/src/core/traits.rs          |   2 +-
 5 files changed, 162 insertions(+), 146 deletions(-)
 delete mode 100644 example_dict.json
diff --git a/example_dict.json b/example_dict.json
deleted file mode 100644
index 81beb7a..0000000
--- a/example_dict.json
+++ /dev/null
@@ -1,12 +0,0 @@
-[
-  {"word": "hello", "metadata": {"type": "greeting", "language": "english"}},
-  {"word": "world", "metadata": {"type": "noun", "language": "english"}},
-  {"word": "rust", "metadata": {"type": "programming_language", "paradigm": "systems"}},
-  {"word": "programming", "metadata": {"type": "verb", "context": "computing"}},
-  {"word": "database", "metadata": {"type": "noun", "context": "data_storage"}},
-  {"word": "sqlite", "metadata": {"type": "database_engine", "features": ["embedded", "sql"]}},
-  {"word": "json", "metadata": {"type": "data_format", "standard": "RFC 8259"}},
-  {"word": "import", "metadata": {"type": "verb", "context": "data_operations"}},
-  {"word": "dictionary", "metadata": {"type": "noun", "context": "reference"}},
-  {"word": "example", "metadata": {"type": "noun", "usage": "demonstration"}}
-]
\ No newline at end of file
diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 3261a7e..66fc9e7 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -14,7 +14,10 @@ anyhow = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 chrono = { version = "0.4", features = ["serde"] }
-thiserror = "1.0"
+thiserror = "2.0"
 async-trait = "0.1"
 parking_lot = "0.12"
-sqlx = { version = "0.7", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] }
+sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] }
+
+[dev-dependencies]
+mockall = "0.14.0"
diff --git a/lib/src/core/errors.rs b/lib/src/core/errors.rs
index 1c9dbe5..a8da207 100644
--- a/lib/src/core/errors.rs
+++ b/lib/src/core/errors.rs
@@ -17,3 +17,11 @@ pub enum EncoderError {
     #[error("Unexpected error: {0}")]
     UnexpectedError(String),
 }
+
+#[derive(Error, Debug)]
+pub enum DecoderError {
+    #[error("Decoder input error")]
+    InputError,
+    #[error("Unexpected error: {0}")]
+    UnexpectedError(String),
+}
diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/core/sys_major/lvmap.rs
index e418025..03bc938 100644
--- a/lib/src/core/sys_major/lvmap.rs
+++ b/lib/src/core/sys_major/lvmap.rs
@@ -1,6 +1,10 @@
 use std::collections::HashMap;
 
-use crate::core::{DictRepository, SystemEncoder};
+use anyhow::Error;
+
+use crate::core::errors::RepositoryError;
+use crate::core::traits::DecodedValue;
+use crate::core::{DictRepository, SystemDecoder};
 
 // We store words by encoded number length, then encoded value
 // Example:
@@ -15,6 +19,7 @@ use crate::core::{DictRepository, SystemEncoder};
 //     - 45:
 //       - word: oral
 
+// Words are fetched from DictRepository in batches
 const DEFAULT_DICT_BATCH_SIZE: usize = 100;
 
 type ValueLength = u8;
@@ -23,35 +28,66 @@ pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>;
 
 #[derive(Debug)]
 pub struct LenValueMap {
-    map: LenValueData,
-    batch_size: usize,
+    data: LenValueData,
 }
 
 impl LenValueMap {
     pub fn new() -> Self {
         LenValueMap {
-            map: LenValueData::new(),
-            batch_size: DEFAULT_DICT_BATCH_SIZE,
+            data: LenValueData::new(),
         }
     }
+
     pub fn with_data(mut self, data: LenValueData) -> Self {
-        self.map = data;
+        self.data = data;
         self
     }
 
-    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
-        self.batch_size = batch_size;
-        self
+    pub fn data(&self) -> &LenValueData {
+        &self.data
+    }
+
+    pub fn into_data(self) -> LenValueData {
+        self.data
     }
 
-    pub fn build<E: SystemEncoder, R: DictRepository>(encoder: &E, repo: &R) -> Self {
-        let mut map = LenValueData::new();
+    /// Decodes every non-empty word and appends it to `data[digit count][value]`.
+    ///
+    /// # Errors
+    /// Stops at the first decoded value that is not a valid `u64`, or whose length
+    /// does not fit `ValueLength`; words inserted before that point are kept.
+    pub fn insert_words(
+        &mut self,
+        words: Vec<String>,
+        decoder: &impl SystemDecoder,
+    ) -> Result<(), Error> {
+        for word in words.into_iter().filter(|w| !w.is_empty()) {
+            let decoded = decoder.decode(&word);
+            let int_value = decoded.parse::<u64>()?;
+            // `decoded` may carry leading zeros, so its length is unbounded; convert it
+            // with a check instead of a silently truncating `as` cast.
+            let len = ValueLength::try_from(decoded.len())?;
+
+            self.data
+                .entry(len)
+                .or_default()
+                .entry(int_value)
+                .or_default()
+                .push(word);
+        }
+        Ok(())
+    }
 
-        LenValueMap::new().with_data(map)
+    pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self {
+        Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // TODO: is this a common approach?
     }
 
     pub fn is_empty(&self) -> bool {
-        self.map.is_empty()
+        self.data.is_empty()
+    }
+
+    fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self {
+        todo!()
     }
 }
 
@@ -61,137 +97,118 @@ mod tests {
     use crate::core::{entities::*, errors::*};
     use std::collections::HashMap;
 
-    #[test]
-    fn test_no_inpout_then_empty_map() {
-        let repo = MockRepository::new().with_single_word_dict();
-        let encoder = MockEncoder::new(EncodingResult {
-            input: "".to_string(),
-            output: vec![],
-        });
-        let lv_map = LenValueMap::build(&encoder, &repo);
-        assert!(lv_map.is_empty());
+    use mockall::automock;
+    use mockall::{mock, predicate::*};
+
+    const TEST_WORD_1: &str = "test_word_1";
+    const TEST_WORD_2: &str = "test_word_2";
+    const TEST_WORD_3: &str = "test_word_3";
+    const TEST_WORD_4: &str = "test_word_4";
+    const TEST_NUM_1: u64 = 12;
+    const TEST_NUM_2: u64 = 34;
+    const TEST_NUM_3: u64 = 9876;
+    const TEST_NUM_1_LEN: u8 = 2;
+    const TEST_NUM_3_LEN: u8 = 4;
+
+    fn get_test_dec_map() -> HashMap<String, DecodedValue> {
+        HashMap::from([
+            (TEST_WORD_1.to_string(), TEST_NUM_1.to_string()),
+            (TEST_WORD_2.to_string(), TEST_NUM_2.to_string()),
+            (TEST_WORD_3.to_string(), TEST_NUM_3.to_string()),
+            (TEST_WORD_4.to_string(), TEST_NUM_3.to_string()),
+        ])
     }
 
-    // #[test]
-    // fn test_empty_dict_then_no_output() {
-    //     let dict = Dict {
-    //         name: "test".to_string(),
-    //         entries: HashMap::new(),
-    //     };
-    //     let repo = MockRepository::new(dict);
-    //     let encoder = Encoder::new(&repo);
-    //     let result = encoder.encode("test").unwrap();
-    //     assert!(result.output.is_empty());
-    // }
-
-    // #[test]
-    // fn test_encode_single_digit() {
-    //     let dict = get_single_word_dict();
-    //     let repo = MockRepository::new(dict);
-    //     let encoder = Encoder::new(&repo).with_batch_size(1);
-    //     let result = encoder.encode("test").unwrap();
-    //     assert!(result.output.is_empty());
-    // }
-
-    // ---------- Helpers ----------
-
-    fn get_single_word_dict() -> Dict {
-        Dict {
-            name: "test_dict".to_string(),
-            entries: HashMap::from([(
-                1,
-                DictEntry {
-                    id: Some(1),
-                    text: "test_word_1".to_string(),
-                    metadata: HashMap::new(),
-                },
-            )]),
-        }
+    fn mock_decoding(word: &str) -> DecodedValue {
+        get_test_dec_map().remove(word).unwrap()
     }
 
-    // ---------- Mocks ----------
-
-    struct MockEncoder {
-        result: EncodingResult,
+    fn get_test_words() -> Vec<String> {
+        vec![
+            TEST_WORD_1.to_string(),
+            TEST_WORD_2.to_string(),
+            TEST_WORD_3.to_string(),
+            TEST_WORD_4.to_string(),
+        ]
     }
 
-    impl MockEncoder {
-        fn new(result: EncodingResult) -> Self {
-            MockEncoder { result }
+    mock! {
+        pub Decoder {}
+        impl SystemDecoder for Decoder {
+            fn decode(&self, word: &str) -> DecodedValue;
         }
     }
 
-    impl SystemEncoder for MockEncoder {
-        fn initialize(&self) -> Result<(), EncoderError> {
-            Ok(())
-        }
-
-        fn encode(&self, _word: &str) -> Result<EncodingResult, EncoderError> {
-            Ok(self.result.clone())
-        }
-    }
+    #[test]
+    fn test_insert_words_empty() {
+        let words = vec![];
+        let mut decoder = MockDecoder::new();
+        decoder.expect_decode().returning(|_| DecodedValue::new());
 
-    struct MockRepository {
-        dict: Dict,
+        let mut lv_map = LenValueMap::new();
+        lv_map.insert_words(words, &decoder).unwrap();
+        assert!(lv_map.is_empty());
     }
 
-    impl MockRepository {
-        pub fn new() -> Self {
-            MockRepository {
-                dict: Dict::new("test_dict".to_string()),
-            }
-        }
-
-        pub fn with_single_word_dict(mut self) -> Self {
-            self.dict = get_single_word_dict();
-            self
-        }
+    #[test]
+    fn test_single_word() {
+        let words = vec![TEST_WORD_1.to_string()];
+        let mut decoder = MockDecoder::new();
+        decoder
+            .expect_decode()
+            .returning(|word| mock_decoding(word));
+
+        let mut lv_map = LenValueMap::new();
+        lv_map.insert_words(words, &decoder).unwrap();
+
+        let data = lv_map.into_data();
+
+        assert_eq!(data.len(), 1);
+        assert!(data.contains_key(&TEST_NUM_1_LEN));
+        let data = data.get(&TEST_NUM_1_LEN).unwrap();
+        assert!(data.contains_key(&TEST_NUM_1));
+        let words = data.get(&TEST_NUM_1).unwrap();
+        assert_eq!(words.len(), 1);
+        assert_eq!(words[0], TEST_WORD_1);
     }
 
-    #[async_trait::async_trait]
-    impl DictRepository for MockRepository {
-        async fn create_dict(&self, _name: &str) -> Result<(), RepositoryError> {
-            Ok(())
-        }
-
-        async fn save_entries(
-            &self,
-            _dict_name: &str,
-            _entries: &[DictEntry],
-        ) -> Result<(), RepositoryError> {
-            todo!()
-        }
-
-        async fn fetch_many(
-            &self,
-            _name: &str,
-            limit: Option<u32>,
-            offset: Option<u32>,
-        ) -> Result<Dict, RepositoryError> {
-            let offset = offset.unwrap_or(0) as usize;
-            let limit = limit.unwrap_or(u32::MAX) as usize;
-
-            let mut entries_vec: Vec<_> = self.dict.entries.iter().collect();
-            entries_vec.sort_by_key(|&(id, _)| *id);
-
-            let paginated = entries_vec.into_iter().skip(offset).take(limit);
-            let paginated_map: HashMap<u64, DictEntry> = paginated
-                .map(|(id, entry)| {
-                    (
-                        *id,
-                        DictEntry {
-                            id: entry.id,
-                            text: entry.text.clone(),
-                            metadata: entry.metadata.clone(),
-                        },
-                    )
-                })
-                .collect();
-
-            Ok(Dict {
-                name: self.dict.name.clone(),
-                entries: paginated_map,
-            })
-        }
+    #[test]
+    fn test_multiple_words() {
+        let words = get_test_words();
+
+        let mut decoder = MockDecoder::new();
+        decoder
+            .expect_decode()
+            .returning(|word| mock_decoding(word));
+
+        let mut lv_map = LenValueMap::new();
+        lv_map.insert_words(words, &decoder).unwrap();
+
+        let data = lv_map.into_data();
+
+        assert_eq!(data.len(), 2); // two different lengths
+        assert!(data.contains_key(&TEST_NUM_1_LEN));
+        assert!(data.contains_key(&TEST_NUM_3_LEN));
+        let l2 = data.get(&TEST_NUM_1_LEN).unwrap();
+        let l4 = data.get(&TEST_NUM_3_LEN).unwrap();
+
+        assert_eq!(l2.len(), 2); // two numbers
+        assert_eq!(l4.len(), 1); // one number
+        assert!(l2.contains_key(&TEST_NUM_1));
+        assert!(l2.contains_key(&TEST_NUM_2));
+        assert!(l4.contains_key(&TEST_NUM_3));
+
+        let words = l2.get(&TEST_NUM_1).unwrap();
+        assert_eq!(words.len(), 1);
+        assert_eq!(words[0], TEST_WORD_1);
+
+        let words = l2.get(&TEST_NUM_2).unwrap();
+        assert_eq!(words.len(), 1);
+        assert_eq!(words[0], TEST_WORD_2);
+
+        let words = l4.get(&TEST_NUM_3).unwrap();
+        assert_eq!(words.len(), 2);
+        assert_eq!(words[0], TEST_WORD_3);
+        assert_eq!(words[1], TEST_WORD_4);
     }
 }
diff --git a/lib/src/core/traits.rs b/lib/src/core/traits.rs
index f06cbfa..a06d663 100644
--- a/lib/src/core/traits.rs
+++ b/lib/src/core/traits.rs
@@ -8,7 +8,7 @@ use super::errors::RepositoryError;
 /// but decoded as one number. For partial values, we can use
 /// u64, but for the whole decoded value that may be very long,
 /// we need a string.
-type DecodedValue = String;
+pub type DecodedValue = String;
 
 pub trait SystemDecoder {
     fn decode(&self, word: &str) -> DecodedValue;