From c5466baebb7d92f3059f3374f77b0bf04a18b5a7 Mon Sep 17 00:00:00 2001 From: chodak166 Date: Sun, 28 Dec 2025 19:16:00 +0100 Subject: [PATCH] WIP: encoder --- app/src/app.rs | 17 +- app/src/config.rs | 9 +- app/src/container.rs | 29 ++- lib/Cargo.toml | 1 + lib/src/application/config.rs | 6 + lib/src/application/services.rs | 15 +- lib/src/core/entities.rs | 16 +- lib/src/core/sys_major/encoder.rs | 138 +++++++++---- lib/src/core/sys_major/lvmap.rs | 187 +++++++++--------- lib/src/core/sys_major/rules_pl.rs | 2 +- lib/src/core/system.rs | 18 +- lib/src/core/traits.rs | 9 + .../infrastructure/sqlite_dict_repository.rs | 41 ++++ lib/src/lib.rs | 3 +- lib/src/presentation/cli/cli_args.rs | 12 ++ lib/src/presentation/cli/commands.rs | 1 + lib/src/presentation/cli/commands/decode.rs | 2 +- lib/src/presentation/cli/commands/encode.rs | 10 + .../presentation/cli/commands/import_dict.rs | 9 +- lib/src/presentation/cli/defaults.rs | 4 +- 20 files changed, 352 insertions(+), 177 deletions(-) create mode 100644 lib/src/presentation/cli/commands/encode.rs diff --git a/app/src/app.rs b/app/src/app.rs index 5f8b321..0067796 100644 --- a/app/src/app.rs +++ b/app/src/app.rs @@ -26,7 +26,7 @@ impl Application { debug!("Bootstrapping application..."); - let container = Container::new(&config).await?; + let container = Container::new().await?; Ok(Self { config, @@ -43,12 +43,17 @@ impl Application { Command::Decode(_) => { commands::decode::run(self.config.decoder).await; } + Command::Encode(_) => { + let repo = self.container.create_dict_repo("demo_pl").await?; + + commands::encode::run(self.config.encoder, repo.as_ref()).await; + } Command::ImportDict(_) => { - commands::import_dict::run( - self.config.import_dict, - self.container.dict_repository.clone(), - ) - .await?; + let importer = self + .container + .create_dict_importer(&self.config.import_dict.name) + .await?; + commands::import_dict::run(self.config.import_dict, importer).await?; } } Ok(()) diff --git a/app/src/config.rs b/app/src/config.rs index 7b8b872..28c5a16 100644 --- a/app/src/config.rs +++ b/app/src/config.rs @@ -3,12 +3,13 @@ use config::{Config, Environment, File}; use serde::Deserialize; use applib::cli::{Command, GlobalArgs, defaults::set_defaults}; -use applib::config::{DecoderConfig, ImportDictConfig, ServerConfig}; +use applib::config::*; #[derive(Debug, Deserialize, Clone)] pub struct AppConfig { pub server: ServerConfig, pub decoder: DecoderConfig, + pub encoder: EncoderConfig, pub import_dict: ImportDictConfig, pub log_level: String, } @@ -46,6 +47,12 @@ impl AppConfig { } builder = builder.set_override("decoder.input", cmd_args.input.clone())?; } + Command::Encode(cmd_args) => { + if let Some(name) = &cmd_args.system { + builder = builder.set_override("encoder.system", name.as_str())?; + } + builder = builder.set_override("encoder.input", cmd_args.input.clone())?; + } Command::ImportDict(cmd_args) => { builder = builder.set_override("import_dict.name", cmd_args.name.clone())?; builder = builder.set_override("import_dict.path", cmd_args.path.clone())?; diff --git a/app/src/container.rs b/app/src/container.rs index 3608652..e53a646 100644 --- a/app/src/container.rs +++ b/app/src/container.rs @@ -1,20 +1,29 @@ -use crate::config::AppConfig; -use applib::application::services::DictImporter; +use std::sync::Arc; + +// use crate::config::AppConfig; +use applib::DictImporter; use applib::infrastructure::sqlite_dict_repository::SqliteDictRepository; +use applib::traits::DictRepository; #[derive(Clone)] -pub struct Container { - pub dict_repository: SqliteDictRepository, -} +pub struct Container; impl Container { - pub async fn new(config: &AppConfig) -> anyhow::Result { - let dict_repository = SqliteDictRepository::new("sqlite:app.db").await?; + pub async fn new() -> anyhow::Result { + Ok(Self) + } - Ok(Self { dict_repository }) + pub async fn create_dict_importer(&self, dict_name: &str) -> anyhow::Result { + let repo = self.create_dict_repo(dict_name).await?; + Ok(DictImporter::new(repo)) } - pub fn create_dict_importer(&self) -> DictImporter { - DictImporter::new(&self.dict_repository) + pub async fn create_dict_repo( + &self, + dict_name: &str, + ) -> anyhow::Result> { + let mut dict_repo = SqliteDictRepository::new("sqlite:app.db").await?; + dict_repo.use_dict(dict_name); + Ok(Arc::new(dict_repo)) } } diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 66fc9e7..e36e612 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -18,6 +18,7 @@ thiserror = "2.0" async-trait = "0.1" parking_lot = "0.12" sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] } +futures = "0.3.31" [dev-dependencies] mockall = "0.14.0" diff --git a/lib/src/application/config.rs b/lib/src/application/config.rs index 8c809d4..de84147 100644 --- a/lib/src/application/config.rs +++ b/lib/src/application/config.rs @@ -12,6 +12,12 @@ pub struct DecoderConfig { pub input: String, } +#[derive(Debug, Deserialize, Clone)] +pub struct EncoderConfig { + pub system: System, + pub input: String, +} + #[derive(Debug, Deserialize, Clone)] pub struct ImportDictConfig { pub name: String, diff --git a/lib/src/application/services.rs b/lib/src/application/services.rs index 84fba4d..89fd02f 100644 --- a/lib/src/application/services.rs +++ b/lib/src/application/services.rs @@ -1,18 +1,25 @@ +use std::sync::Arc; + use crate::core::traits::{DictRepository, DictSource}; -pub struct DictImporter<'a, R> { - repo: &'a R, +pub struct DictImporter { + repo: Arc, batch_size: usize, } -impl<'a, R: DictRepository> DictImporter<'a, R> { - pub fn new(repo: &'a R) -> Self { +impl DictImporter { + pub fn new(repo: Arc) -> Self { Self { repo, batch_size: 1000, // reasonable default } } + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } + pub async fn import(&self, mut source: impl DictSource) -> Result<(), anyhow::Error> { // 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?) self.repo.create_dict().await?; diff --git a/lib/src/core/entities.rs b/lib/src/core/entities.rs index 68e38bd..93b7808 100644 --- a/lib/src/core/entities.rs +++ b/lib/src/core/entities.rs @@ -5,16 +5,26 @@ use std::{collections::HashMap, u64}; /// A number encoded as a sequence of words #[derive(Debug, Clone)] -pub struct EncodedValue(Vec); +pub struct EncodedPart { + pub value: u64, + pub words: Vec, +} + +/// A way (variant) to split input number +pub type EncodedSplit = Vec; + +/// A number encoded as words, split in multiple ways +#[derive(Debug, Clone)] +pub struct EncodedValue(Vec); impl EncodedValue { - pub fn new(data: Vec) -> Self { + pub fn new(data: Vec) -> Self { EncodedValue(data) } } impl Deref for EncodedValue { - type Target = Vec; + type Target = Vec; fn deref(&self) -> &Self::Target { &self.0 diff --git a/lib/src/core/sys_major/encoder.rs b/lib/src/core/sys_major/encoder.rs index fbe18f5..596f789 100644 --- a/lib/src/core/sys_major/encoder.rs +++ b/lib/src/core/sys_major/encoder.rs @@ -1,4 +1,11 @@ -use crate::core::{entities::EncodedValue, errors::CodecError, sys_major::LenValueMap, traits::*}; +use core::num; + +use crate::core::{ + entities::{EncodedPart, EncodedSplit, EncodedValue}, + errors::CodecError, + sys_major::LenValueMap, + traits::*, +}; #[derive(Debug)] pub struct Encoder { @@ -15,69 +22,95 @@ impl SystemEncoder for Encoder { fn initialize(&self) -> Result<(), CodecError> { Ok(()) } - fn encode(&self, word: &str) -> Result { - let size = word.chars().count(); + + fn encode(&self, input: &str) -> Result { + let size = input.chars().count(); let max_mask: usize = (1 << (size - 1)) - 1; - let indices: Vec = word.char_indices().map(|(i, _)| i).collect(); + let indices: Vec = input.char_indices().map(|(i, _)| i).collect(); let mut results = Vec::with_capacity(max_mask); for mask in 0..=max_mask { - let mut parts = Vec::new(); - let mut last_split = 0; + let mut parts: Vec = Vec::new(); + let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts // Iterate through the mask bits to find where to split for i in 0..size - 1 { // Check if the i-th bit is set if (mask >> i) & 1 == 1 { // The split corresponds to the byte index of the (i+1)-th character - let split_idx = indices[i + 1]; - parts.push(&word[last_split..split_idx]); + let split_idx = indices[indices.len() - i - 1]; + parts.push(input[split_idx..last_split].to_string()); last_split = split_idx; } } // Push the remaining part of the string - parts.push(&word[last_split..]); - - // Calculate metrics for sorting - let num_parts = parts.len(); - - // To find the "most equal" size, we minimize the sum of squared lengths. - // (This mathematically minimizes variance without needing floating point math) - let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); + parts.push(input[..last_split].to_string()); + + let mut all_matched = true; + let mut split = EncodedSplit::new(); + parts.reverse(); + + for part in &parts { + let Ok(num_part) = part.parse::() else { + all_matched = false; + break; + }; + let Some(words) = self.lv_map.get(part.len() as u8, num_part) else { + all_matched = false; + break; + }; + split.push(EncodedPart { + value: num_part, + words: words.clone(), + }); + } - // Construct the final string representation (e.g., "abc|de|fg") - let result_string = parts.join("|"); + if all_matched { + results.push(Partition { + value: split, + // To find the "most equal" size, we minimize the sum of squared lengths. + // (This mathematically minimizes variance without needing floating point math) + sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(), + }); + } - results.push(Partition { - word: result_string, - num_parts, - sum_sq_len, - }); + // Calculate metrics for sorting + // let num_parts = parts.len(); + + // // To find the "most equal" size, we minimize the sum of squared lengths. + // // (This mathematically minimizes variance without needing floating point math) + // let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); + + // if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) { + // results.push(Partition { + // parts: words.clone(), + // sum_sq_len, + // }); + // } } // Ok(EncodedValue::new(words)) // Sort by: // 1. Fewer parts first (1 part, then 2 parts...) // 2. Most equal lengths (lower sum of squared lengths is more balanced) - // 3. Lexicographically (for deterministic stability) + // 3. TODO: Lexicographically (for deterministic stability)? results.sort_by(|a, b| { - a.num_parts - .cmp(&b.num_parts) + a.value + .len() + .cmp(&b.value.len()) .then(a.sum_sq_len.cmp(&b.sum_sq_len)) - .then(a.word.cmp(&b.word)) }); // Extract just the strings - let words = results.into_iter().map(|p| p.word).collect(); - Ok(EncodedValue::new(words)) + let split_results = results.into_iter().map(|p| p.value).collect(); + Ok(EncodedValue::new(split_results)) } } -// A helper struct to keep the string and its sort metrics together +// A helper struct to keep the split variant and its sort metrics together struct Partition { - word: String, - num_parts: usize, + value: EncodedSplit, sum_sq_len: usize, } @@ -95,9 +128,12 @@ mod tests { let encoder = Encoder::new(lvmap); let result = encoder.encode("345").unwrap(); - assert_eq!(result.len(), 2); - assert!(result.contains(&"test_345_1".into())); - assert!(result.contains(&"test_345_2".into())); + assert_eq!(result.len(), 1); // single split + assert_eq!(result[0].len(), 1); // single part + assert_eq!(result[0][0].value, 345); + assert_eq!(result[0][0].words.len(), 2); // two words + assert_eq!(result[0][0].words[0], "test_345_1"); + assert_eq!(result[0][0].words[1], "test_345_2"); } #[test] @@ -106,6 +142,7 @@ mod tests { lvmap.push(1, 0, "test_0"); lvmap.push(1, 9, "test_9"); lvmap.push(1, 8, "test_8"); + lvmap.push(1, 7, "test_7"); lvmap.push(2, 98, "test_98"); lvmap.push(2, 87, "test_87"); lvmap.push(3, 987, "test_987"); @@ -114,11 +151,30 @@ mod tests { let encoder = Encoder::new(lvmap); let result = encoder.encode("987").unwrap(); - assert_eq!(result.len(), 5); - assert!(result.contains(&"test_987".into())); - assert!(result.contains(&"test_98".into())); - assert!(result.contains(&"test_87".into())); - assert!(result.contains(&"test_9".into())); - assert!(result.contains(&"test_8".into())); + assert_eq!(result.len(), 4); // 987, 98|7, 9|87, 9|8|7 + assert_eq!(result[0].len(), 1); // 987 + + assert_eq!(result[0][0].words.len(), 1); + assert_eq!(result[0][0].words[0], "test_987"); + + assert_eq!(result[1].len(), 2); // 98|7 + assert_eq!(result[1][0].words.len(), 1); + assert_eq!(result[1][0].words[0], "test_98"); + assert_eq!(result[1][1].words.len(), 1); + assert_eq!(result[1][1].words[0], "test_7"); + + assert_eq!(result[2].len(), 2); // 9|87 + assert_eq!(result[2][0].words.len(), 1); + assert_eq!(result[2][0].words[0], "test_9"); + assert_eq!(result[2][1].words.len(), 1); + assert_eq!(result[2][1].words[0], "test_87"); + + assert_eq!(result[3].len(), 3); // 9|8|7 + assert_eq!(result[3][0].words.len(), 1); + assert_eq!(result[3][0].words[0], "test_9"); + assert_eq!(result[3][1].words.len(), 1); + assert_eq!(result[3][1].words[0], "test_8"); + assert_eq!(result[3][2].words.len(), 1); + assert_eq!(result[3][2].words[0], "test_7"); } } diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/core/sys_major/lvmap.rs index e9ed4c1..9620ffd 100644 --- a/lib/src/core/sys_major/lvmap.rs +++ b/lib/src/core/sys_major/lvmap.rs @@ -1,4 +1,9 @@ -use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError}; +use crate::core::{ + DictRepository, SystemDecoder, + entities::DecodedLength, + errors::{CodecError, RepositoryError}, +}; +use futures::{Stream, StreamExt, TryStreamExt}; use std::{collections::HashMap, hash::Hash, num::ParseIntError}; use thiserror::Error; @@ -26,6 +31,9 @@ pub enum LenValueMapError { #[error(transparent)] Codec(#[from] CodecError), + #[error(transparent)] + Repository(#[from] RepositoryError), + #[error("unable to build encoder data: {0}")] Build(String), } @@ -61,6 +69,10 @@ impl LenValueMap { self } + pub fn get(&self, len: u8, num: DecodedNumber) -> Option<&Vec> { + self.data.get(&DecodedLength::from(len))?.get(&num) + } + pub fn insert_words( &mut self, words: I, @@ -89,38 +101,28 @@ impl LenValueMap { Self { data: data } } - pub async fn from_dict( - decoder: &impl SystemDecoder, - repo: &impl DictRepository, - ) -> Result { - Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE).await - } - - async fn build( - decoder: &impl SystemDecoder, - repo: &impl DictRepository, - batch_size: usize, - ) -> Result { + pub async fn from_stream( + mut stream: S, + decoder: &dyn SystemDecoder, + ) -> Result + where + // S is a stream of "Result, Error>" + S: Stream, crate::core::errors::RepositoryError>> + Unpin, + { let mut map = LenValueMap::new(); - let mut offset = 0; - const MAX_OFFSET: usize = 10_000_000; - - loop { - let dict = repo - .fetch_many(batch_size, offset) - .await - .map_err(|e| LenValueMapError::Build(e.to_string()))?; - - if dict.entries.is_empty() { - break; - } - let words: Vec = dict.entries.into_iter().map(|entry| entry.1.text).collect(); - map.insert_words(words, decoder)?; - - offset += batch_size; - if offset >= MAX_OFFSET { - break; + // We stream the batches one by one. + // This ensures only one batch is in memory at a time. + while let Some(batch_result) = stream.next().await { + match batch_result { + Ok(batch) => { + // We delegate to the synchronous logic for the heavy lifting + map.insert_words(batch, decoder)?; + } + Err(e) => { + // Convert RepositoryError to LenValueMapError::Build + return Err(e.into()); + } } } @@ -133,6 +135,8 @@ mod tests { use super::*; use crate::core::{entities::*, errors::*}; use async_trait::async_trait; + use futures::stream; + use std::collections::HashMap; use mockall::{Sequence, automock}; @@ -260,22 +264,6 @@ mod tests { // --- build --- - mock! { - pub Repo {} - - #[async_trait] - impl DictRepository for Repo { - async fn create_dict(&self) -> Result<(), RepositoryError>; - fn use_dict(&mut self, _name: &str); - async fn save_entries(&self, _entry: &[DictEntry]) -> Result<(), RepositoryError>; - async fn fetch_many( - &self, - limit: usize, - offset: usize, - ) -> Result; - } - } - fn dict_with_words(words: &[&str]) -> Dict { let mut dict = Dict::new("default".into()); @@ -287,91 +275,94 @@ mod tests { } #[tokio::test] - async fn test_build_single_batch() { - let mut repo = MockRepo::new(); + async fn test_from_stream_success() { + // 1. Setup Mocks (Same as before) let mut decoder = MockDecoder::new(); - let mut seq = Sequence::new(); - decoder .expect_decode() .returning(|word| mock_decoding(word)); - // FIRST CALL expectation (will be called first) - repo.expect_fetch_many() - .times(1) // Explicitly expect 1 call - .in_sequence(&mut seq) // Enforce order - .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1]))); + // 2. Prepare Data + // We wrap the inner Vecs in Ok() because the stream expects Result, RepositoryError> + let batches = vec![ + Ok(vec![TEST_WORD_1.into(), TEST_WORD_2.into()]), + Ok(vec![TEST_WORD_3.into(), TEST_WORD_4.into()]), + ]; - // SECOND CALL expectation (will be called second) - repo.expect_fetch_many() - .times(1) - .in_sequence(&mut seq) - .returning(|_, _| Ok(Dict::new("default_dict".into()))); + // 3. Create a Stream from the Vec + // stream::iter converts an IntoIterator into a Stream + let stream = stream::iter(batches); - let data = LenValueMap::build(&decoder, &repo, 1) + // 4. Inject the stream (Dependency Injection) + let map = LenValueMap::from_stream(stream, &decoder) .await - .unwrap() - .into_data(); + .expect("Should build map successfully"); - assert_eq!(data.len(), 1); + // 5. Assertions + let data = map.into_data(); + assert_eq!(data.len(), 2); + assert!(data.contains_key(&TEST_NUM_1_LEN)); + assert!(data.contains_key(&TEST_NUM_3_LEN)); } #[tokio::test] - async fn test_build_multiple_batches() { - let mut repo = MockRepo::new(); + async fn test_from_stream_failure() { let mut decoder = MockDecoder::new(); - let mut seq = Sequence::new(); - decoder .expect_decode() .returning(|word| mock_decoding(word)); - repo.expect_fetch_many() - .times(1) // Explicitly expect 1 call - .in_sequence(&mut seq) // Enforce order - .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1]))); + let batches = vec![ + Ok(vec![TEST_WORD_1.into()]), + Err(RepositoryError::ConnectionFailed), + Ok(vec![TEST_WORD_3.into()]), + ]; - repo.expect_fetch_many() - .times(1) // Explicitly expect 1 call - .in_sequence(&mut seq) // Enforce order - .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_3]))); // word with different decoded length + let stream = stream::iter(batches); + let result = LenValueMap::from_stream(stream, &decoder).await; - repo.expect_fetch_many() - .times(1) - .in_sequence(&mut seq) - .returning(|_, _| Ok(Dict::new("default_dict".into()))); - - let data = LenValueMap::build(&decoder, &repo, 1) - .await - .unwrap() - .into_data(); - - assert_eq!(data.len(), 2); + match result { + // We match specifically on the Repository variant and the ConnectionFailed inner error + Err(LenValueMapError::Repository(RepositoryError::ConnectionFailed)) => { + // Success! The correct error type propagated up. + } + // If it's any other error (including a stringified one), we fail + _ => panic!( + "Expected LenValueMapError::Repository(ConnectionFailed), got {:?}", + result + ), + } } // #[tokio::test] - // async fn test_build_fetches_multiple_batches() { + // async fn test_build_multiple_batches() { // let mut repo = MockRepo::new(); // let mut decoder = MockDecoder::new(); + // let mut seq = Sequence::new(); // decoder // .expect_decode() // .returning(|word| mock_decoding(word)); // repo.expect_fetch_many() - // .with(eq("default"), eq(Some(2)), eq(Some(0))) - // .return_once(|_, _, _| Ok(dict_with_words(&[TEST_WORD_1, TEST_WORD_2]))); + // .times(1) // Explicitly expect 1 call + // .in_sequence(&mut seq) // Enforce order + // .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1]))); // repo.expect_fetch_many() - // .with(eq("default"), eq(Some(2)), eq(Some(2))) - // .return_once(|_, _, _| Ok(dict_with_words(&[TEST_WORD_3, TEST_WORD_4]))); + // .times(1) // Explicitly expect 1 call + // .in_sequence(&mut seq) // Enforce order + // .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_3]))); // word with different decoded length // repo.expect_fetch_many() - // .with(eq("default"), eq(Some(2)), eq(Some(4))) - // .return_once(|_, _, _| Ok(Dict::new("default".into()))); - - // let map = LenValueMap::build(&decoder, &repo, 2).await; - // let data = map.into_data(); + // .times(1) + // .in_sequence(&mut seq) + // .returning(|_, _| Ok(Dict::new("default_dict".into()))); + + // let data = LenValueMap::build(&decoder, &repo, 1) + // .await + // .unwrap() + // .into_data(); // assert_eq!(data.len(), 2); // } diff --git a/lib/src/core/sys_major/rules_pl.rs b/lib/src/core/sys_major/rules_pl.rs index fd6b3e9..0cdf587 100644 --- a/lib/src/core/sys_major/rules_pl.rs +++ b/lib/src/core/sys_major/rules_pl.rs @@ -133,8 +133,8 @@ pub fn get_rules() -> Rules { #[cfg(test)] mod tests { use super::*; - use crate::SystemDecoder; use crate::core::sys_major::Decoder; + use crate::traits::SystemDecoder; #[test] fn test_major_dict_pl_decode_0_1() { diff --git a/lib/src/core/system.rs b/lib/src/core/system.rs index a8f5819..3954ba1 100644 --- a/lib/src/core/system.rs +++ b/lib/src/core/system.rs @@ -1,7 +1,7 @@ use serde::Deserialize; -use crate::core::SystemDecoder; -use crate::core::sys_major as major; +use crate::core::sys_major::{self as major, LenValueMap}; +use crate::core::{DictRepository, SystemDecoder, SystemEncoder}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] pub enum System { @@ -28,3 +28,17 @@ pub fn create_decoder(system: &System) -> Box { System::MajorEn => Box::new(major::Decoder::new(major::rules_en::get_rules())), } } + +pub async fn create_encoder(system: &System, dict: &dyn DictRepository) -> Box { + // let decoder = create_decoder(&system); + let decoder = major::Decoder::new(match system { + System::MajorPl => major::rules_pl::get_rules(), + System::MajorEn => major::rules_en::get_rules(), + }); + let stream = dict.stream_batches(100).await.unwrap(); // TODO + let lvmap = LenValueMap::from_stream(stream, &decoder).await.unwrap(); // TODO + match system { + System::MajorPl => Box::new(major::Encoder::new(lvmap)), + System::MajorEn => Box::new(major::Encoder::new(lvmap)), + } +} diff --git a/lib/src/core/traits.rs b/lib/src/core/traits.rs index 34d56d6..1e1218c 100644 --- a/lib/src/core/traits.rs +++ b/lib/src/core/traits.rs @@ -1,3 +1,5 @@ +use futures::stream::BoxStream; + use crate::core::entities::EncodedValue; use crate::core::errors::CodecError; @@ -26,6 +28,13 @@ pub trait DictRepository: Send + Sync { /// Fetch a page of entries. async fn fetch_many(&self, limit: usize, offset: usize) -> Result; + + /// Returns a cold stream that fetches strings in chunks. + /// The stream yields `Result, RepositoryError>`. + async fn stream_batches( + &self, + batch_size: usize, + ) -> Result, RepositoryError>>, RepositoryError>; } pub trait DictSource { diff --git a/lib/src/infrastructure/sqlite_dict_repository.rs b/lib/src/infrastructure/sqlite_dict_repository.rs index 08b042a..d2e794e 100644 --- a/lib/src/infrastructure/sqlite_dict_repository.rs +++ b/lib/src/infrastructure/sqlite_dict_repository.rs @@ -1,6 +1,9 @@ use crate::core::entities::{Dict, DictEntry}; use crate::core::errors::RepositoryError; use crate::core::traits::DictRepository; +use futures::stream::BoxStream; +use futures::{StreamExt, TryStreamExt}; + use sqlx::{Row, SqlitePool, sqlite::SqliteConnectOptions}; use std::collections::HashMap; use std::str::FromStr; @@ -187,4 +190,42 @@ impl DictRepository for SqliteDictRepository { entries: entries_map, }) } + + async fn stream_batches( + &self, + batch_size: usize, + ) -> Result, RepositoryError>>, RepositoryError> { + // 1. Resolve ID first + let dict_id = self.get_dict_id().await?; + + // 2. Create the base query stream. + // We do NOT use limit/offset. We let the DB stream rows via a cursor. + let query_stream = sqlx::query("SELECT text FROM entries WHERE dictionary_id = ?") + .bind(dict_id) + .fetch(&self.pool); + + // 3. Transform the stream using Functional combinators + let stream = query_stream + // Map SQLx errors to Domain errors + .map_err(|e| RepositoryError::StorageError(e.to_string())) + // Extract the String from the Row + .and_then(|row| async move { + // 'text' is the column name + let text: String = row + .try_get("text") + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + Ok(text) + }) + // Group items into vectors of size `batch_size` + .try_chunks(batch_size) + // try_chunks returns a specific error type on failure, map it back + .map_err(|e| { + // logic to handle leftover elements if error occurs, + // but for simplicity, we treat stream errors as fatal here + RepositoryError::StorageError(e.to_string()) + }); + + // 4. Box the stream to erase the complex iterator type (Type Erasure) + Ok(Box::pin(stream)) + } } diff --git a/lib/src/lib.rs b/lib/src/lib.rs index cd02344..bb90b6d 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -4,6 +4,7 @@ pub mod infrastructure; mod presentation; pub use self::application::config; +pub use self::application::services::DictImporter; pub use self::core::system; -pub use self::core::traits::SystemDecoder; +pub use self::core::traits; pub use self::presentation::cli; diff --git a/lib/src/presentation/cli/cli_args.rs b/lib/src/presentation/cli/cli_args.rs index 8a05bd4..e7994df 100644 --- a/lib/src/presentation/cli/cli_args.rs +++ b/lib/src/presentation/cli/cli_args.rs @@ -30,6 +30,9 @@ pub enum Command { /// Decode a word using given system Decode(DecodeArgs), + /// Encode a number using given system + Encode(EncodeArgs), + /// Import dictionary ImportDict(ImportDictArgs), } @@ -42,6 +45,15 @@ pub struct ServerArgs { #[derive(ClapArgs, Debug, Clone)] pub struct DecodeArgs { + #[arg(long, help = defaults::HELP_DEC_SYSTEM)] + pub system: Option, + + #[arg(long, help = defaults::HELP_DEC_INPUT)] + pub input: String, +} + +#[derive(ClapArgs, Debug, Clone)] +pub struct EncodeArgs { #[arg(long, help = defaults::HELP_ENC_SYSTEM)] pub system: Option, diff --git a/lib/src/presentation/cli/commands.rs b/lib/src/presentation/cli/commands.rs index 0c7b143..5b766bb 100644 --- a/lib/src/presentation/cli/commands.rs +++ b/lib/src/presentation/cli/commands.rs @@ -1,3 +1,4 @@ pub mod decode; +pub mod encode; pub mod import_dict; pub mod server; diff --git a/lib/src/presentation/cli/commands/decode.rs b/lib/src/presentation/cli/commands/decode.rs index 42db5ea..a95cda5 100644 --- a/lib/src/presentation/cli/commands/decode.rs +++ b/lib/src/presentation/cli/commands/decode.rs @@ -3,7 +3,7 @@ use crate::core::system; use tracing::debug; pub async fn run(config: DecoderConfig) { - debug!("Running greeter with config {:?}", config); + debug!("Running decoder with config {:?}", config); let decoder = system::create_decoder(&config.system); let result = decoder.decode(&config.input).unwrap(); println!("{}", result.as_str()); diff --git a/lib/src/presentation/cli/commands/encode.rs b/lib/src/presentation/cli/commands/encode.rs new file mode 100644 index 0000000..c78704f --- /dev/null +++ b/lib/src/presentation/cli/commands/encode.rs @@ -0,0 +1,10 @@ +use crate::application::config::EncoderConfig; +use crate::core::{DictRepository, system}; +use tracing::debug; + +pub async fn run(config: EncoderConfig, dict: &dyn DictRepository) { + debug!("Running encoder with config {:?}", config); + let encoder = system::create_encoder(&config.system, dict).await; + let result = encoder.encode(&config.input); + println!("{:?}", result); +} diff --git a/lib/src/presentation/cli/commands/import_dict.rs b/lib/src/presentation/cli/commands/import_dict.rs index eebd650..40dc935 100644 --- a/lib/src/presentation/cli/commands/import_dict.rs +++ b/lib/src/presentation/cli/commands/import_dict.rs @@ -1,12 +1,8 @@ use crate::application::{config::ImportDictConfig, services::DictImporter}; -use crate::core::traits::DictRepository; use crate::infrastructure::json_file_dict_source::JsonFileDictSource; use tracing::{debug, error, info}; -pub async fn run( - config: ImportDictConfig, - repository: R, -) -> Result<(), anyhow::Error> { +pub async fn run(config: ImportDictConfig, importer: DictImporter) -> Result<(), anyhow::Error> { debug!("Importing dict with config {:?}", config); info!( @@ -17,9 +13,6 @@ pub async fn run( // Create the JSON file source (will auto-generate IDs starting from 1) let source = JsonFileDictSource::new(&config.path)?; - // Create the importer - let importer = DictImporter::new(&repository); - // Perform the import (this will call create() first) match importer.import(source).await { Ok(()) => { diff --git a/lib/src/presentation/cli/defaults.rs b/lib/src/presentation/cli/defaults.rs index f665002..0e6430f 100644 --- a/lib/src/presentation/cli/defaults.rs +++ b/lib/src/presentation/cli/defaults.rs @@ -10,8 +10,10 @@ pub const SYSTEM_NAME: &str = "major_pl"; pub const HELP_PORT: &str = formatcp!("Override Port [default: {}]", PORT); pub const HELP_LOG: &str = formatcp!("Override Log Level [default: {}]", LOG_LEVEL); +pub const HELP_DEC_SYSTEM: &str = formatcp!("System to use [default: {}]", SYSTEM_NAME); +pub const HELP_DEC_INPUT: &str = formatcp!("Text to decode"); pub const HELP_ENC_SYSTEM: &str = formatcp!("System to use [default: {}]", SYSTEM_NAME); -pub const HELP_ENC_INPUT: &str = formatcp!("Text to decode"); +pub const HELP_ENC_INPUT: &str = formatcp!("Number to encode"); pub const HELP_IMPORT_DICT_NAME: &str = formatcp!("Dictionary name"); pub const HELP_IMPORT_DICT_INPUT: &str = formatcp!("Dictionary file path");