From 6189b4318bb87297c3e4cc27bb1dd4cdeb705b15 Mon Sep 17 00:00:00 2001 From: chodak166 Date: Sun, 21 Dec 2025 12:55:55 +0100 Subject: [PATCH] WIP: lv_map refactor & tests --- lib/src/core/entities.rs | 96 +++++++++++++--- lib/src/core/errors.rs | 28 +++-- lib/src/core/sys_major/decoder.rs | 7 +- lib/src/core/sys_major/decoder_tests.rs | 26 ++--- lib/src/core/sys_major/encoder.rs | 8 +- lib/src/core/sys_major/lvmap.rs | 115 +++++++++----------- lib/src/core/sys_major/rules_pl.rs | 24 ++-- lib/src/core/traits.rs | 18 +-- lib/src/presentation/cli/commands/decode.rs | 4 +- 9 files changed, 184 insertions(+), 142 deletions(-) diff --git a/lib/src/core/entities.rs b/lib/src/core/entities.rs index f670357..141284b 100644 --- a/lib/src/core/entities.rs +++ b/lib/src/core/entities.rs @@ -1,4 +1,81 @@ -use std::collections::HashMap; +use super::errors::CodecError; +use std::num::ParseIntError; +use std::{collections::HashMap, u64}; + +/// A number encoded as a sequence of words +#[derive(Debug, Clone)] +pub struct EncodedValue(Vec); + +/// The number value can be encoded as many word sets, +/// but decoded as one number. For partial values +/// and dictionary words (reasonable length), we can use +/// u64 (20-digit number), but the whole input text can +/// be longer than 20 digits, so we operate on String (<= 255). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecodedValue(String); + +impl DecodedValue { + pub fn new(value: String) -> Result { + if value.len() > u8::MAX as usize { + Err(CodecError::TextTooLong(value.len())) + } else { + Ok(Self(value)) + } + } + + pub fn as_str(&self) -> &str { + &self.0 + } + + pub fn parse(&self) -> Result { + self.0.parse() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn value_len(&self) -> Result { + if self.len() == 0 { + return Err(CodecError::EmptyValue); + } + DecodedLength::try_from(self.len()) + } +} + +impl PartialEq<&str> for DecodedValue { + fn eq(&self, other: &&str) -> bool { + &self.0 == *other + } +} + +impl PartialEq for &str { + fn eq(&self, other: &DecodedValue) -> bool { + *self == &other.0 + } +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +pub struct DecodedLength(u8); + +impl DecodedLength { + pub const fn from(value: u8) -> Self { + Self(value) + } +} + +impl TryFrom for DecodedLength { + type Error = CodecError; + fn try_from(value: usize) -> Result { + if value > u8::MAX as usize { + Err(CodecError::ValueLimitExceeded(value)) + } else { + Ok(Self(value as u8)) + } + } +} + +// --- Dictionary --- pub type DictEntryId = u64; @@ -37,20 +114,3 @@ impl Dict { self.entries.insert(entry.id.unwrap(), entry); } } - -#[derive(Debug, Clone)] -pub struct EncodedItem { - pub entry: DictEntry, -} - -impl EncodedItem { - pub fn value(&self) -> &str { - &self.entry.text - } -} - -#[derive(Debug, Clone)] -pub struct EncodingResult { - pub input: String, - pub output: Vec, -} diff --git a/lib/src/core/errors.rs b/lib/src/core/errors.rs index a8da207..7f05a37 100644 --- a/lib/src/core/errors.rs +++ b/lib/src/core/errors.rs @@ -4,24 +4,28 @@ use thiserror::Error; pub enum RepositoryError { #[error("Data source connection failed")] ConnectionFailed, - #[error("Dictionary '{0}' not found")] + + #[error("'{0}' not found")] NotFound(String), + #[error("Storage error: {0}")] StorageError(String), } -#[derive(Error, Debug)] -pub enum EncoderError { - #[error("Encoder initialization failed")] +#[derive(Debug, Error)] +pub enum CodecError { + #[error("text too long: {0} bytes")] + TextTooLong(usize), + + #[error("value too large: {0}/255")] + ValueLimitExceeded(usize), + + #[error("operation not allowed on empty value")] + EmptyValue, + + #[error("initialization failed")] InitializationFailed, - #[error("Unexpected error: {0}")] - UnexpectedError(String), -} -#[derive(Error, Debug)] -pub enum DecoderError { - #[error("Decoder input error")] - InputError, - #[error("Unexpected error: {0}")] + #[error("unexpected error: {0}")] UnexpectedError(String), } diff --git a/lib/src/core/sys_major/decoder.rs b/lib/src/core/sys_major/decoder.rs index e60dd91..b7ce7c6 100644 --- a/lib/src/core/sys_major/decoder.rs +++ b/lib/src/core/sys_major/decoder.rs @@ -1,4 +1,4 @@ -use crate::core::traits::SystemDecoder; +use crate::core::{entities::DecodedValue, errors::CodecError, traits::SystemDecoder}; #[derive(Debug, Default, Clone)] pub struct Rule { @@ -108,7 +108,7 @@ impl Decoder { } impl SystemDecoder for Decoder { - fn decode(&self, word: &str) -> String { + fn decode(&self, word: &str) -> Result { let mut matches: RuleMatches = self .rules .iter() @@ -117,6 +117,7 @@ impl SystemDecoder for Decoder { matches.sort_by_key(|&(pos, _)| pos); dbg!(&matches); - matches.into_iter().map(|(_, value)| value).collect() + let num_str: String = matches.into_iter().map(|(_, value)| value).collect(); + DecodedValue::new(num_str) } } diff --git a/lib/src/core/sys_major/decoder_tests.rs b/lib/src/core/sys_major/decoder_tests.rs index 8410351..2c79050 100644 --- a/lib/src/core/sys_major/decoder_tests.rs +++ b/lib/src/core/sys_major/decoder_tests.rs @@ -45,90 +45,90 @@ mod tests { #[test] fn test_single_symbol_encoding_only_before_only_after_matched() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("ABC"); + let output = decoder.decode("ABC").unwrap(); assert_eq!(output, "2") } #[test] fn test_double_symbol_encoding_only_before_only_after_matched() { let decoder = Decoder::new(create_double_rules()); - let output = decoder.decode("ABCDEF"); + let output = decoder.decode("ABCDEF").unwrap(); assert_eq!(output, "2") } #[test] fn test_single_symbol_encoding_only_before_not_matched_with_other() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("DBC"); + let output = decoder.decode("DBC").unwrap(); assert_eq!(output, "") } #[test] fn test_double_symbol_encoding_only_before_not_matched_with_other() { let decoder = Decoder::new(create_double_rules()); - let output = decoder.decode("AACDEE"); + let output = decoder.decode("AACDEE").unwrap(); assert_eq!(output, "") } #[test] fn test_case_insensitivity() { let decoder = Decoder::new(create_double_rules()); - let output = decoder.decode("abcdef"); + let output = decoder.decode("abcdef").unwrap(); assert_eq!(output, "2") } #[test] fn test_single_symbol_encoding_only_before_not_matched_with_empty() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("BC"); + let output = decoder.decode("BC").unwrap(); assert_eq!(output, "") } #[test] fn test_single_symbol_encoding_only_before_not_matched_with_not_before() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("XBC"); + let output = decoder.decode("XBC").unwrap(); assert_eq!(output, "") } #[test] fn test_single_symbol_encoding_only_after_not_matched_with_other() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("ABD"); + let output = decoder.decode("ABD").unwrap(); assert_eq!(output, "") } #[test] fn test_single_symbol_encoding_only_after_not_matched_with_empty() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("AB"); + let output = decoder.decode("AB").unwrap(); assert_eq!(output, "") } #[test] fn test_single_symbol_encoding_only_after_not_matched_with_not_after() { let decoder = Decoder::new(create_single_rules()); - let output = decoder.decode("ABY"); + let output = decoder.decode("ABY").unwrap(); assert_eq!(output, "") } #[test] fn test_single_symbol_encoding_empty_before_after_matched_with_empty() { let decoder = Decoder::new(create_single_rules_min()); - let output = decoder.decode("B"); + let output = decoder.decode("B").unwrap(); assert_eq!(output, "2") } #[test] fn test_single_symbol_encoding_empty_before_after_matched_with_others() { let decoder = Decoder::new(create_single_rules_min()); - let output = decoder.decode("AXBYC"); + let output = decoder.decode("AXBYC").unwrap(); assert_eq!(output, "2") } #[test] fn test_encoding_multiple_phonemes() { let decoder = Decoder::new(create_double_rules()); - let output = decoder.decode("VvmNabCd33mn00CD22cdefmn"); + let output = decoder.decode("VvmNabCd33mn00CD22cdefmn").unwrap(); assert_eq!(output, "32323") } } diff --git a/lib/src/core/sys_major/encoder.rs b/lib/src/core/sys_major/encoder.rs index 2ff07a4..fa1c0ef 100644 --- a/lib/src/core/sys_major/encoder.rs +++ b/lib/src/core/sys_major/encoder.rs @@ -1,6 +1,4 @@ -use crate::core::{ - entities::EncodingResult, errors::EncoderError, sys_major::LenValueMap, traits::*, -}; +use crate::core::{entities::EncodedValue, errors::CodecError, sys_major::LenValueMap, traits::*}; #[derive(Debug)] pub struct Encoder { @@ -14,10 +12,10 @@ impl Encoder { } impl SystemEncoder for Encoder { - fn initialize(&self) -> Result<(), EncoderError> { + fn initialize(&self) -> Result<(), CodecError> { Ok(()) } - fn encode(&self, word: &str) -> Result { + fn encode(&self, word: &str) -> Result { todo!() } } diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/core/sys_major/lvmap.rs index 03bc938..74ace5c 100644 --- a/lib/src/core/sys_major/lvmap.rs +++ b/lib/src/core/sys_major/lvmap.rs @@ -1,10 +1,6 @@ -use std::collections::HashMap; - -use anyhow::Error; - -use crate::core::errors::RepositoryError; -use crate::core::traits::DecodedValue; -use crate::core::{DictRepository, SystemDecoder}; +use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError}; +use std::{collections::HashMap, num::ParseIntError}; +use thiserror::Error; // We store words by encoded number length, then encoded value // Example: @@ -22,68 +18,62 @@ use crate::core::{DictRepository, SystemDecoder}; // Words are fetched from DictRepository in batches const DEFAULT_DICT_BATCH_SIZE: usize = 100; -type ValueLength = u8; -type Value = u64; -pub type LenValueData = HashMap>>; +#[derive(Error, Debug)] +pub enum LenValueMapError { + #[error("Value parsing error: {0}")] + Parse(#[from] ParseIntError), + + #[error(transparent)] + Codec(#[from] CodecError), +} + +type DecodedNumber = u64; +pub type LenValueData = HashMap>>; -#[derive(Debug)] +#[derive(Debug, Default, Clone)] pub struct LenValueMap { data: LenValueData, } impl LenValueMap { pub fn new() -> Self { - LenValueMap { - data: LenValueData::new(), - } + Self::default() } - pub fn with_data(mut self, data: LenValueData) -> Self { - self.data = data; - self - } - - pub fn data(&self) -> &LenValueData { - &self.data + pub fn is_empty(&self) -> bool { + self.data.is_empty() } pub fn into_data(self) -> LenValueData { self.data } - pub fn insert_words( + pub fn insert_words( &mut self, - words: Vec, - decoder: &impl SystemDecoder, - ) -> Result<(), Error> { + words: I, + decoder: &dyn SystemDecoder, + ) -> Result<(), LenValueMapError> + where + I: IntoIterator, + { for word in words { if word.is_empty() { continue; } - let decoded = decoder.decode(&word); - let int_value = decoded.parse::()?; - let len = decoded.len() as u8; - - if let Some(len_item) = self.data.get_mut(&len) { - if let Some(value_item) = len_item.get_mut(&int_value) { - value_item.push(word); - } else { - len_item.insert(int_value, vec![word]); - } - } else { - self.data - .insert(len, HashMap::from([(int_value, vec![word])])); - } + let decoded = decoder.decode(&word)?; + + self.data + .entry(decoded.value_len()?) + .or_default() + .entry(decoded.parse()?) + .or_default() + .push(word); } Ok(()) } - pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { - Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach? - } - - pub fn is_empty(&self) -> bool { - self.data.is_empty() + pub async fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { + Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) } fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self { @@ -107,20 +97,26 @@ mod tests { const TEST_NUM_1: u64 = 12; const TEST_NUM_2: u64 = 34; const TEST_NUM_3: u64 = 9876; - const TEST_NUM_1_LEN: u8 = 2; - const TEST_NUM_3_LEN: u8 = 4; + const TEST_NUM_1_LEN: DecodedLength = DecodedLength::from(2); + const TEST_NUM_3_LEN: DecodedLength = DecodedLength::from(4); + + fn decoded_value(n: u64) -> DecodedValue { + DecodedValue::new(n.to_string()).unwrap() + } fn get_test_dec_map() -> HashMap { HashMap::from([ - (TEST_WORD_1.to_string(), TEST_NUM_1.to_string()), - (TEST_WORD_2.to_string(), TEST_NUM_2.to_string()), - (TEST_WORD_3.to_string(), TEST_NUM_3.to_string()), - (TEST_WORD_4.to_string(), TEST_NUM_3.to_string()), + (TEST_WORD_1.to_string(), decoded_value(TEST_NUM_1)), + (TEST_WORD_2.to_string(), decoded_value(TEST_NUM_2)), + (TEST_WORD_3.to_string(), decoded_value(TEST_NUM_3)), + (TEST_WORD_4.to_string(), decoded_value(TEST_NUM_3)), ]) } - fn mock_decoding(word: &str) -> DecodedValue { - get_test_dec_map().remove(word).unwrap() + fn mock_decoding(word: &str) -> Result { + get_test_dec_map() + .remove(word) + .ok_or_else(|| CodecError::UnexpectedError("".to_string())) } fn get_test_words() -> Vec { @@ -135,21 +131,10 @@ mod tests { mock! { pub Decoder {} impl SystemDecoder for Decoder { - fn decode(&self, word: &str) -> DecodedValue; + fn decode(&self, word: &str) -> Result; } } - #[test] - fn test_insert_words_empty() { - let words = vec![]; - let mut decoder = MockDecoder::new(); - decoder.expect_decode().returning(|_| DecodedValue::new()); - - let mut lv_map = LenValueMap::new(); - lv_map.insert_words(words, &decoder).unwrap(); - assert!(lv_map.is_empty()); - } - #[test] fn test_single_word() { let words = vec![TEST_WORD_1.to_string()]; diff --git a/lib/src/core/sys_major/rules_pl.rs b/lib/src/core/sys_major/rules_pl.rs index c4b1c26..fd6b3e9 100644 --- a/lib/src/core/sys_major/rules_pl.rs +++ b/lib/src/core/sys_major/rules_pl.rs @@ -139,84 +139,84 @@ mod tests { #[test] fn test_major_dict_pl_decode_0_1() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("SZSCZ"); + let output = decoder.decode("SZSCZ").unwrap(); assert_eq!(output, "0") } #[test] fn test_major_dict_pl_decode_0_2() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("SZSICZ"); + let output = decoder.decode("SZSICZ").unwrap(); assert_eq!(output, "") } #[test] fn test_major_dict_pl_decode_0_3() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("SZCZRZZCZDZSZ"); + let output = decoder.decode("SZCZRZZCZDZSZ").unwrap(); assert_eq!(output, "0") } #[test] fn test_major_dict_pl_decode_0_4() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("SZCZRZZICZDZSZ"); + let output = decoder.decode("SZCZRZZICZDZSZ").unwrap(); assert_eq!(output, "") } #[test] fn test_major_dict_pl_decode_1_1() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("SZTCZ"); + let output = decoder.decode("SZTCZ").unwrap(); assert_eq!(output, "1") } #[test] fn test_major_dict_pl_decode_1_2() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("DZDŻDŹDDZDŻDŹ"); + let output = decoder.decode("DZDŻDŹDDZDŻDŹ").unwrap(); assert_eq!(output, "1") } #[test] fn test_major_dict_pl_decode_1_3() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("DZDŻDŹDZDZDŻDŹ"); + let output = decoder.decode("DZDŻDŹDZDZDŻDŹ").unwrap(); assert_eq!(output, "") } #[test] fn test_major_dict_pl_decode_2_1() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("NINNI"); + let output = decoder.decode("NINNI").unwrap(); assert_eq!(output, "2") } #[test] fn test_major_dict_pl_decode_2_2() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("NININI"); + let output = decoder.decode("NININI").unwrap(); assert_eq!(output, "") } #[test] fn test_major_dict_pl_decode_4_1() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("RZRRZ"); + let output = decoder.decode("RZRRZ").unwrap(); assert_eq!(output, "4") } #[test] fn test_major_dict_pl_decode_4_2() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("RZRZRZ"); + let output = decoder.decode("RZRZRZ").unwrap(); assert_eq!(output, "") } #[test] fn test_major_dict_pl_decode_full_1() { let decoder = Decoder::new(get_rules()); - let output = decoder.decode("ATADANAMARALAJAKAGAFAWAPABA"); + let output = decoder.decode("ATADANAMARALAJAKAGAFAWAPABA").unwrap(); assert_eq!(output, "1123456778899") } } diff --git a/lib/src/core/traits.rs b/lib/src/core/traits.rs index a06d663..673403b 100644 --- a/lib/src/core/traits.rs +++ b/lib/src/core/traits.rs @@ -1,22 +1,16 @@ -use crate::core::entities::EncodingResult; -use crate::core::errors::EncoderError; +use crate::core::entities::EncodedValue; +use crate::core::errors::CodecError; -use super::entities::{Dict, DictEntry}; +use super::entities::{DecodedValue, Dict, DictEntry}; use super::errors::RepositoryError; -/// The number value can be encoded as many word sets, -/// but decoded as one number. For partial values, we can use -/// u64, but for the whole decoded value that may be very long, -/// we need a string. -pub type DecodedValue = String; - pub trait SystemDecoder { - fn decode(&self, word: &str) -> DecodedValue; + fn decode(&self, word: &str) -> Result; } pub trait SystemEncoder { - fn initialize(&self) -> Result<(), EncoderError>; - fn encode(&self, word: &str) -> Result; + fn initialize(&self) -> Result<(), CodecError>; + fn encode(&self, word: &str) -> Result; } #[async_trait::async_trait] diff --git a/lib/src/presentation/cli/commands/decode.rs b/lib/src/presentation/cli/commands/decode.rs index d28d708..42db5ea 100644 --- a/lib/src/presentation/cli/commands/decode.rs +++ b/lib/src/presentation/cli/commands/decode.rs @@ -5,6 +5,6 @@ use tracing::debug; pub async fn run(config: DecoderConfig) { debug!("Running greeter with config {:?}", config); let decoder = system::create_decoder(&config.system); - let result = decoder.decode(&config.input); - println!("{}", result); + let result = decoder.decode(&config.input).unwrap(); + println!("{}", result.as_str()); }