Browse Source

WIP: encoder

develop-refactor
chodak166 4 months ago
parent
commit
c5466baebb
  1. 17
      app/src/app.rs
  2. 9
      app/src/config.rs
  3. 29
      app/src/container.rs
  4. 1
      lib/Cargo.toml
  5. 6
      lib/src/application/config.rs
  6. 15
      lib/src/application/services.rs
  7. 16
      lib/src/core/entities.rs
  8. 138
      lib/src/core/sys_major/encoder.rs
  9. 187
      lib/src/core/sys_major/lvmap.rs
  10. 2
      lib/src/core/sys_major/rules_pl.rs
  11. 18
      lib/src/core/system.rs
  12. 9
      lib/src/core/traits.rs
  13. 41
      lib/src/infrastructure/sqlite_dict_repository.rs
  14. 3
      lib/src/lib.rs
  15. 12
      lib/src/presentation/cli/cli_args.rs
  16. 1
      lib/src/presentation/cli/commands.rs
  17. 2
      lib/src/presentation/cli/commands/decode.rs
  18. 10
      lib/src/presentation/cli/commands/encode.rs
  19. 9
      lib/src/presentation/cli/commands/import_dict.rs
  20. 4
      lib/src/presentation/cli/defaults.rs

17
app/src/app.rs

@ -26,7 +26,7 @@ impl Application {
debug!("Bootstrapping application..."); debug!("Bootstrapping application...");
let container = Container::new(&config).await?; let container = Container::new().await?;
Ok(Self { Ok(Self {
config, config,
@ -43,12 +43,17 @@ impl Application {
Command::Decode(_) => { Command::Decode(_) => {
commands::decode::run(self.config.decoder).await; commands::decode::run(self.config.decoder).await;
} }
Command::Encode(_) => {
let repo = self.container.create_dict_repo("demo_pl").await?;
commands::encode::run(self.config.encoder, repo.as_ref()).await;
}
Command::ImportDict(_) => { Command::ImportDict(_) => {
commands::import_dict::run( let importer = self
self.config.import_dict, .container
self.container.dict_repository.clone(), .create_dict_importer(&self.config.import_dict.name)
) .await?;
.await?; commands::import_dict::run(self.config.import_dict, importer).await?;
} }
} }
Ok(()) Ok(())

9
app/src/config.rs

@ -3,12 +3,13 @@ use config::{Config, Environment, File};
use serde::Deserialize; use serde::Deserialize;
use applib::cli::{Command, GlobalArgs, defaults::set_defaults}; use applib::cli::{Command, GlobalArgs, defaults::set_defaults};
use applib::config::{DecoderConfig, ImportDictConfig, ServerConfig}; use applib::config::*;
#[derive(Debug, Deserialize, Clone)] #[derive(Debug, Deserialize, Clone)]
pub struct AppConfig { pub struct AppConfig {
pub server: ServerConfig, pub server: ServerConfig,
pub decoder: DecoderConfig, pub decoder: DecoderConfig,
pub encoder: EncoderConfig,
pub import_dict: ImportDictConfig, pub import_dict: ImportDictConfig,
pub log_level: String, pub log_level: String,
} }
@ -46,6 +47,12 @@ impl AppConfig {
} }
builder = builder.set_override("decoder.input", cmd_args.input.clone())?; builder = builder.set_override("decoder.input", cmd_args.input.clone())?;
} }
Command::Encode(cmd_args) => {
if let Some(name) = &cmd_args.system {
builder = builder.set_override("encoder.system", name.as_str())?;
}
builder = builder.set_override("encoder.input", cmd_args.input.clone())?;
}
Command::ImportDict(cmd_args) => { Command::ImportDict(cmd_args) => {
builder = builder.set_override("import_dict.name", cmd_args.name.clone())?; builder = builder.set_override("import_dict.name", cmd_args.name.clone())?;
builder = builder.set_override("import_dict.path", cmd_args.path.clone())?; builder = builder.set_override("import_dict.path", cmd_args.path.clone())?;

29
app/src/container.rs

@ -1,20 +1,29 @@
use crate::config::AppConfig; use std::sync::Arc;
use applib::application::services::DictImporter;
// use crate::config::AppConfig;
use applib::DictImporter;
use applib::infrastructure::sqlite_dict_repository::SqliteDictRepository; use applib::infrastructure::sqlite_dict_repository::SqliteDictRepository;
use applib::traits::DictRepository;
#[derive(Clone)] #[derive(Clone)]
pub struct Container { pub struct Container;
pub dict_repository: SqliteDictRepository,
}
impl Container { impl Container {
pub async fn new(config: &AppConfig) -> anyhow::Result<Self> { pub async fn new() -> anyhow::Result<Self> {
let dict_repository = SqliteDictRepository::new("sqlite:app.db").await?; Ok(Self)
}
Ok(Self { dict_repository }) pub async fn create_dict_importer(&self, dict_name: &str) -> anyhow::Result<DictImporter> {
let repo = self.create_dict_repo(dict_name).await?;
Ok(DictImporter::new(repo))
} }
pub fn create_dict_importer(&self) -> DictImporter<SqliteDictRepository> { pub async fn create_dict_repo(
DictImporter::new(&self.dict_repository) &self,
dict_name: &str,
) -> anyhow::Result<Arc<dyn DictRepository>> {
let mut dict_repo = SqliteDictRepository::new("sqlite:app.db").await?;
dict_repo.use_dict(dict_name);
Ok(Arc::new(dict_repo))
} }
} }

1
lib/Cargo.toml

@ -18,6 +18,7 @@ thiserror = "2.0"
async-trait = "0.1" async-trait = "0.1"
parking_lot = "0.12" parking_lot = "0.12"
sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] } sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] }
futures = "0.3.31"
[dev-dependencies] [dev-dependencies]
mockall = "0.14.0" mockall = "0.14.0"

6
lib/src/application/config.rs

@ -12,6 +12,12 @@ pub struct DecoderConfig {
pub input: String, pub input: String,
} }
#[derive(Debug, Deserialize, Clone)]
pub struct EncoderConfig {
pub system: System,
pub input: String,
}
#[derive(Debug, Deserialize, Clone)] #[derive(Debug, Deserialize, Clone)]
pub struct ImportDictConfig { pub struct ImportDictConfig {
pub name: String, pub name: String,

15
lib/src/application/services.rs

@ -1,18 +1,25 @@
use std::sync::Arc;
use crate::core::traits::{DictRepository, DictSource}; use crate::core::traits::{DictRepository, DictSource};
pub struct DictImporter<'a, R> { pub struct DictImporter {
repo: &'a R, repo: Arc<dyn DictRepository>,
batch_size: usize, batch_size: usize,
} }
impl<'a, R: DictRepository> DictImporter<'a, R> { impl DictImporter {
pub fn new(repo: &'a R) -> Self { pub fn new(repo: Arc<dyn DictRepository>) -> Self {
Self { Self {
repo, repo,
batch_size: 1000, // reasonable default batch_size: 1000, // reasonable default
} }
} }
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
self.batch_size = batch_size;
self
}
pub async fn import(&self, mut source: impl DictSource) -> Result<(), anyhow::Error> { pub async fn import(&self, mut source: impl DictSource) -> Result<(), anyhow::Error> {
// 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?) // 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?)
self.repo.create_dict().await?; self.repo.create_dict().await?;

16
lib/src/core/entities.rs

@ -5,16 +5,26 @@ use std::{collections::HashMap, u64};
/// A number encoded as a sequence of words /// A number encoded as a sequence of words
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct EncodedValue(Vec<String>); pub struct EncodedPart {
pub value: u64,
pub words: Vec<String>,
}
/// A way (variant) to split input number
pub type EncodedSplit = Vec<EncodedPart>;
/// A number encoded as words, split in multiple ways
#[derive(Debug, Clone)]
pub struct EncodedValue(Vec<EncodedSplit>);
impl EncodedValue { impl EncodedValue {
pub fn new(data: Vec<String>) -> Self { pub fn new(data: Vec<EncodedSplit>) -> Self {
EncodedValue(data) EncodedValue(data)
} }
} }
impl Deref for EncodedValue { impl Deref for EncodedValue {
type Target = Vec<String>; type Target = Vec<EncodedSplit>;
fn deref(&self) -> &Self::Target { fn deref(&self) -> &Self::Target {
&self.0 &self.0

138
lib/src/core/sys_major/encoder.rs

@ -1,4 +1,11 @@
use crate::core::{entities::EncodedValue, errors::CodecError, sys_major::LenValueMap, traits::*}; use core::num;
use crate::core::{
entities::{EncodedPart, EncodedSplit, EncodedValue},
errors::CodecError,
sys_major::LenValueMap,
traits::*,
};
#[derive(Debug)] #[derive(Debug)]
pub struct Encoder { pub struct Encoder {
@ -15,69 +22,95 @@ impl SystemEncoder for Encoder {
fn initialize(&self) -> Result<(), CodecError> { fn initialize(&self) -> Result<(), CodecError> {
Ok(()) Ok(())
} }
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError> {
let size = word.chars().count(); fn encode(&self, input: &str) -> Result<EncodedValue, CodecError> {
let size = input.chars().count();
let max_mask: usize = (1 << (size - 1)) - 1; let max_mask: usize = (1 << (size - 1)) - 1;
let indices: Vec<usize> = word.char_indices().map(|(i, _)| i).collect(); let indices: Vec<usize> = input.char_indices().map(|(i, _)| i).collect();
let mut results = Vec::with_capacity(max_mask); let mut results = Vec::with_capacity(max_mask);
for mask in 0..=max_mask { for mask in 0..=max_mask {
let mut parts = Vec::new(); let mut parts: Vec<String> = Vec::new();
let mut last_split = 0; let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts
// Iterate through the mask bits to find where to split // Iterate through the mask bits to find where to split
for i in 0..size - 1 { for i in 0..size - 1 {
// Check if the i-th bit is set // Check if the i-th bit is set
if (mask >> i) & 1 == 1 { if (mask >> i) & 1 == 1 {
// The split corresponds to the byte index of the (i+1)-th character // The split corresponds to the byte index of the (i+1)-th character
let split_idx = indices[i + 1]; let split_idx = indices[indices.len() - i - 1];
parts.push(&word[last_split..split_idx]); parts.push(input[split_idx..last_split].to_string());
last_split = split_idx; last_split = split_idx;
} }
} }
// Push the remaining part of the string // Push the remaining part of the string
parts.push(&word[last_split..]); parts.push(input[..last_split].to_string());
// Calculate metrics for sorting let mut all_matched = true;
let num_parts = parts.len(); let mut split = EncodedSplit::new();
parts.reverse();
// To find the "most equal" size, we minimize the sum of squared lengths.
// (This mathematically minimizes variance without needing floating point math) for part in &parts {
let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); let Ok(num_part) = part.parse::<u64>() else {
all_matched = false;
break;
};
let Some(words) = self.lv_map.get(part.len() as u8, num_part) else {
all_matched = false;
break;
};
split.push(EncodedPart {
value: num_part,
words: words.clone(),
});
}
// Construct the final string representation (e.g., "abc|de|fg") if all_matched {
let result_string = parts.join("|"); results.push(Partition {
value: split,
// To find the "most equal" size, we minimize the sum of squared lengths.
// (This mathematically minimizes variance without needing floating point math)
sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(),
});
}
results.push(Partition { // Calculate metrics for sorting
word: result_string, // let num_parts = parts.len();
num_parts,
sum_sq_len, // // To find the "most equal" size, we minimize the sum of squared lengths.
}); // // (This mathematically minimizes variance without needing floating point math)
// let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum();
// if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) {
// results.push(Partition {
// parts: words.clone(),
// sum_sq_len,
// });
// }
} }
// Ok(EncodedValue::new(words)) // Ok(EncodedValue::new(words))
// Sort by: // Sort by:
// 1. Fewer parts first (1 part, then 2 parts...) // 1. Fewer parts first (1 part, then 2 parts...)
// 2. Most equal lengths (lower sum of squared lengths is more balanced) // 2. Most equal lengths (lower sum of squared lengths is more balanced)
// 3. Lexicographically (for deterministic stability) // 3. TODO: Lexicographically (for deterministic stability)?
results.sort_by(|a, b| { results.sort_by(|a, b| {
a.num_parts a.value
.cmp(&b.num_parts) .len()
.cmp(&b.value.len())
.then(a.sum_sq_len.cmp(&b.sum_sq_len)) .then(a.sum_sq_len.cmp(&b.sum_sq_len))
.then(a.word.cmp(&b.word))
}); });
// Extract just the strings // Extract just the strings
let words = results.into_iter().map(|p| p.word).collect(); let split_results = results.into_iter().map(|p| p.value).collect();
Ok(EncodedValue::new(words)) Ok(EncodedValue::new(split_results))
} }
} }
// A helper struct to keep the string and its sort metrics together // A helper struct to keep the split variant and its sort metrics together
struct Partition { struct Partition {
word: String, value: EncodedSplit,
num_parts: usize,
sum_sq_len: usize, sum_sq_len: usize,
} }
@ -95,9 +128,12 @@ mod tests {
let encoder = Encoder::new(lvmap); let encoder = Encoder::new(lvmap);
let result = encoder.encode("345").unwrap(); let result = encoder.encode("345").unwrap();
assert_eq!(result.len(), 2); assert_eq!(result.len(), 1); // single split
assert!(result.contains(&"test_345_1".into())); assert_eq!(result[0].len(), 1); // single part
assert!(result.contains(&"test_345_2".into())); assert_eq!(result[0][0].value, 345);
assert_eq!(result[0][0].words.len(), 2); // two words
assert_eq!(result[0][0].words[0], "test_345_1");
assert_eq!(result[0][0].words[1], "test_345_2");
} }
#[test] #[test]
@ -106,6 +142,7 @@ mod tests {
lvmap.push(1, 0, "test_0"); lvmap.push(1, 0, "test_0");
lvmap.push(1, 9, "test_9"); lvmap.push(1, 9, "test_9");
lvmap.push(1, 8, "test_8"); lvmap.push(1, 8, "test_8");
lvmap.push(1, 7, "test_7");
lvmap.push(2, 98, "test_98"); lvmap.push(2, 98, "test_98");
lvmap.push(2, 87, "test_87"); lvmap.push(2, 87, "test_87");
lvmap.push(3, 987, "test_987"); lvmap.push(3, 987, "test_987");
@ -114,11 +151,30 @@ mod tests {
let encoder = Encoder::new(lvmap); let encoder = Encoder::new(lvmap);
let result = encoder.encode("987").unwrap(); let result = encoder.encode("987").unwrap();
assert_eq!(result.len(), 5); assert_eq!(result.len(), 4); // 987, 98|7, 9|87, 9|8|7
assert!(result.contains(&"test_987".into())); assert_eq!(result[0].len(), 1); // 987
assert!(result.contains(&"test_98".into()));
assert!(result.contains(&"test_87".into())); assert_eq!(result[0][0].words.len(), 1);
assert!(result.contains(&"test_9".into())); assert_eq!(result[0][0].words[0], "test_987");
assert!(result.contains(&"test_8".into()));
assert_eq!(result[1].len(), 2); // 98|7
assert_eq!(result[1][0].words.len(), 1);
assert_eq!(result[1][0].words[0], "test_98");
assert_eq!(result[1][1].words.len(), 1);
assert_eq!(result[1][1].words[0], "test_7");
assert_eq!(result[2].len(), 2); // 9|87
assert_eq!(result[2][0].words.len(), 1);
assert_eq!(result[2][0].words[0], "test_9");
assert_eq!(result[2][1].words.len(), 1);
assert_eq!(result[2][1].words[0], "test_87");
assert_eq!(result[3].len(), 3); // 9|8|7
assert_eq!(result[3][0].words.len(), 1);
assert_eq!(result[3][0].words[0], "test_9");
assert_eq!(result[3][1].words.len(), 1);
assert_eq!(result[3][1].words[0], "test_8");
assert_eq!(result[3][2].words.len(), 1);
assert_eq!(result[3][2].words[0], "test_7");
} }
} }

187
lib/src/core/sys_major/lvmap.rs

@ -1,4 +1,9 @@
use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError}; use crate::core::{
DictRepository, SystemDecoder,
entities::DecodedLength,
errors::{CodecError, RepositoryError},
};
use futures::{Stream, StreamExt, TryStreamExt};
use std::{collections::HashMap, hash::Hash, num::ParseIntError}; use std::{collections::HashMap, hash::Hash, num::ParseIntError};
use thiserror::Error; use thiserror::Error;
@ -26,6 +31,9 @@ pub enum LenValueMapError {
#[error(transparent)] #[error(transparent)]
Codec(#[from] CodecError), Codec(#[from] CodecError),
#[error(transparent)]
Repository(#[from] RepositoryError),
#[error("unable to build encoder data: {0}")] #[error("unable to build encoder data: {0}")]
Build(String), Build(String),
} }
@ -61,6 +69,10 @@ impl LenValueMap {
self self
} }
pub fn get(&self, len: u8, num: DecodedNumber) -> Option<&Vec<String>> {
self.data.get(&DecodedLength::from(len))?.get(&num)
}
pub fn insert_words<I>( pub fn insert_words<I>(
&mut self, &mut self,
words: I, words: I,
@ -89,38 +101,28 @@ impl LenValueMap {
Self { data: data } Self { data: data }
} }
pub async fn from_dict( pub async fn from_stream<S>(
decoder: &impl SystemDecoder, mut stream: S,
repo: &impl DictRepository, decoder: &dyn SystemDecoder,
) -> Result<Self, LenValueMapError> { ) -> Result<Self, LenValueMapError>
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE).await where
} // S is a stream of "Result<Vec<String>, Error>"
S: Stream<Item = Result<Vec<String>, crate::core::errors::RepositoryError>> + Unpin,
async fn build( {
decoder: &impl SystemDecoder,
repo: &impl DictRepository,
batch_size: usize,
) -> Result<Self, LenValueMapError> {
let mut map = LenValueMap::new(); let mut map = LenValueMap::new();
let mut offset = 0;
const MAX_OFFSET: usize = 10_000_000;
loop {
let dict = repo
.fetch_many(batch_size, offset)
.await
.map_err(|e| LenValueMapError::Build(e.to_string()))?;
if dict.entries.is_empty() {
break;
}
let words: Vec<String> = dict.entries.into_iter().map(|entry| entry.1.text).collect(); // We stream the batches one by one.
map.insert_words(words, decoder)?; // This ensures only one batch is in memory at a time.
while let Some(batch_result) = stream.next().await {
offset += batch_size; match batch_result {
if offset >= MAX_OFFSET { Ok(batch) => {
break; // We delegate to the synchronous logic for the heavy lifting
map.insert_words(batch, decoder)?;
}
Err(e) => {
// Convert RepositoryError to LenValueMapError::Build
return Err(e.into());
}
} }
} }
@ -133,6 +135,8 @@ mod tests {
use super::*; use super::*;
use crate::core::{entities::*, errors::*}; use crate::core::{entities::*, errors::*};
use async_trait::async_trait; use async_trait::async_trait;
use futures::stream;
use std::collections::HashMap; use std::collections::HashMap;
use mockall::{Sequence, automock}; use mockall::{Sequence, automock};
@ -260,22 +264,6 @@ mod tests {
// --- build --- // --- build ---
mock! {
pub Repo {}
#[async_trait]
impl DictRepository for Repo {
async fn create_dict(&self) -> Result<(), RepositoryError>;
fn use_dict(&mut self, _name: &str);
async fn save_entries(&self, _entry: &[DictEntry]) -> Result<(), RepositoryError>;
async fn fetch_many(
&self,
limit: usize,
offset: usize,
) -> Result<Dict, RepositoryError>;
}
}
fn dict_with_words(words: &[&str]) -> Dict { fn dict_with_words(words: &[&str]) -> Dict {
let mut dict = Dict::new("default".into()); let mut dict = Dict::new("default".into());
@ -287,91 +275,94 @@ mod tests {
} }
#[tokio::test] #[tokio::test]
async fn test_build_single_batch() { async fn test_from_stream_success() {
let mut repo = MockRepo::new(); // 1. Setup Mocks (Same as before)
let mut decoder = MockDecoder::new(); let mut decoder = MockDecoder::new();
let mut seq = Sequence::new();
decoder decoder
.expect_decode() .expect_decode()
.returning(|word| mock_decoding(word)); .returning(|word| mock_decoding(word));
// FIRST CALL expectation (will be called first) // 2. Prepare Data
repo.expect_fetch_many() // We wrap the inner Vecs in Ok() because the stream expects Result<Vec<String>, RepositoryError>
.times(1) // Explicitly expect 1 call let batches = vec![
.in_sequence(&mut seq) // Enforce order Ok(vec![TEST_WORD_1.into(), TEST_WORD_2.into()]),
.returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1]))); Ok(vec![TEST_WORD_3.into(), TEST_WORD_4.into()]),
];
// SECOND CALL expectation (will be called second) // 3. Create a Stream from the Vec
repo.expect_fetch_many() // stream::iter converts an IntoIterator into a Stream
.times(1) let stream = stream::iter(batches);
.in_sequence(&mut seq)
.returning(|_, _| Ok(Dict::new("default_dict".into())));
let data = LenValueMap::build(&decoder, &repo, 1) // 4. Inject the stream (Dependency Injection)
let map = LenValueMap::from_stream(stream, &decoder)
.await .await
.unwrap() .expect("Should build map successfully");
.into_data();
assert_eq!(data.len(), 1); // 5. Assertions
let data = map.into_data();
assert_eq!(data.len(), 2);
assert!(data.contains_key(&TEST_NUM_1_LEN));
assert!(data.contains_key(&TEST_NUM_3_LEN));
} }
#[tokio::test] #[tokio::test]
async fn test_build_multiple_batches() { async fn test_from_stream_failure() {
let mut repo = MockRepo::new();
let mut decoder = MockDecoder::new(); let mut decoder = MockDecoder::new();
let mut seq = Sequence::new();
decoder decoder
.expect_decode() .expect_decode()
.returning(|word| mock_decoding(word)); .returning(|word| mock_decoding(word));
repo.expect_fetch_many() let batches = vec![
.times(1) // Explicitly expect 1 call Ok(vec![TEST_WORD_1.into()]),
.in_sequence(&mut seq) // Enforce order Err(RepositoryError::ConnectionFailed),
.returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1]))); Ok(vec![TEST_WORD_3.into()]),
];
repo.expect_fetch_many() let stream = stream::iter(batches);
.times(1) // Explicitly expect 1 call let result = LenValueMap::from_stream(stream, &decoder).await;
.in_sequence(&mut seq) // Enforce order
.returning(|_, _| Ok(dict_with_words(&[TEST_WORD_3]))); // word with different decoded length
repo.expect_fetch_many() match result {
.times(1) // We match specifically on the Repository variant and the ConnectionFailed inner error
.in_sequence(&mut seq) Err(LenValueMapError::Repository(RepositoryError::ConnectionFailed)) => {
.returning(|_, _| Ok(Dict::new("default_dict".into()))); // Success! The correct error type propagated up.
}
let data = LenValueMap::build(&decoder, &repo, 1) // If it's any other error (including a stringified one), we fail
.await _ => panic!(
.unwrap() "Expected LenValueMapError::Repository(ConnectionFailed), got {:?}",
.into_data(); result
),
assert_eq!(data.len(), 2); }
} }
// #[tokio::test] // #[tokio::test]
// async fn test_build_fetches_multiple_batches() { // async fn test_build_multiple_batches() {
// let mut repo = MockRepo::new(); // let mut repo = MockRepo::new();
// let mut decoder = MockDecoder::new(); // let mut decoder = MockDecoder::new();
// let mut seq = Sequence::new();
// decoder // decoder
// .expect_decode() // .expect_decode()
// .returning(|word| mock_decoding(word)); // .returning(|word| mock_decoding(word));
// repo.expect_fetch_many() // repo.expect_fetch_many()
// .with(eq("default"), eq(Some(2)), eq(Some(0))) // .times(1) // Explicitly expect 1 call
// .return_once(|_, _, _| Ok(dict_with_words(&[TEST_WORD_1, TEST_WORD_2]))); // .in_sequence(&mut seq) // Enforce order
// .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_1])));
// repo.expect_fetch_many() // repo.expect_fetch_many()
// .with(eq("default"), eq(Some(2)), eq(Some(2))) // .times(1) // Explicitly expect 1 call
// .return_once(|_, _, _| Ok(dict_with_words(&[TEST_WORD_3, TEST_WORD_4]))); // .in_sequence(&mut seq) // Enforce order
// .returning(|_, _| Ok(dict_with_words(&[TEST_WORD_3]))); // word with different decoded length
// repo.expect_fetch_many() // repo.expect_fetch_many()
// .with(eq("default"), eq(Some(2)), eq(Some(4))) // .times(1)
// .return_once(|_, _, _| Ok(Dict::new("default".into()))); // .in_sequence(&mut seq)
// .returning(|_, _| Ok(Dict::new("default_dict".into())));
// let map = LenValueMap::build(&decoder, &repo, 2).await;
// let data = map.into_data(); // let data = LenValueMap::build(&decoder, &repo, 1)
// .await
// .unwrap()
// .into_data();
// assert_eq!(data.len(), 2); // assert_eq!(data.len(), 2);
// } // }

2
lib/src/core/sys_major/rules_pl.rs

@ -133,8 +133,8 @@ pub fn get_rules() -> Rules {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::SystemDecoder;
use crate::core::sys_major::Decoder; use crate::core::sys_major::Decoder;
use crate::traits::SystemDecoder;
#[test] #[test]
fn test_major_dict_pl_decode_0_1() { fn test_major_dict_pl_decode_0_1() {

18
lib/src/core/system.rs

@ -1,7 +1,7 @@
use serde::Deserialize; use serde::Deserialize;
use crate::core::SystemDecoder; use crate::core::sys_major::{self as major, LenValueMap};
use crate::core::sys_major as major; use crate::core::{DictRepository, SystemDecoder, SystemEncoder};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)]
pub enum System { pub enum System {
@ -28,3 +28,17 @@ pub fn create_decoder(system: &System) -> Box<dyn SystemDecoder> {
System::MajorEn => Box::new(major::Decoder::new(major::rules_en::get_rules())), System::MajorEn => Box::new(major::Decoder::new(major::rules_en::get_rules())),
} }
} }
pub async fn create_encoder(system: &System, dict: &dyn DictRepository) -> Box<dyn SystemEncoder> {
// let decoder = create_decoder(&system);
let decoder = major::Decoder::new(match system {
System::MajorPl => major::rules_pl::get_rules(),
System::MajorEn => major::rules_en::get_rules(),
});
let stream = dict.stream_batches(100).await.unwrap(); // TODO
let lvmap = LenValueMap::from_stream(stream, &decoder).await.unwrap(); // TODO
match system {
System::MajorPl => Box::new(major::Encoder::new(lvmap)),
System::MajorEn => Box::new(major::Encoder::new(lvmap)),
}
}

9
lib/src/core/traits.rs

@ -1,3 +1,5 @@
use futures::stream::BoxStream;
use crate::core::entities::EncodedValue; use crate::core::entities::EncodedValue;
use crate::core::errors::CodecError; use crate::core::errors::CodecError;
@ -26,6 +28,13 @@ pub trait DictRepository: Send + Sync {
/// Fetch a page of entries. /// Fetch a page of entries.
async fn fetch_many(&self, limit: usize, offset: usize) -> Result<Dict, RepositoryError>; async fn fetch_many(&self, limit: usize, offset: usize) -> Result<Dict, RepositoryError>;
/// Returns a cold stream that fetches strings in chunks.
/// The stream yields `Result<Vec<String>, RepositoryError>`.
async fn stream_batches(
&self,
batch_size: usize,
) -> Result<BoxStream<'_, Result<Vec<String>, RepositoryError>>, RepositoryError>;
} }
pub trait DictSource { pub trait DictSource {

41
lib/src/infrastructure/sqlite_dict_repository.rs

@ -1,6 +1,9 @@
use crate::core::entities::{Dict, DictEntry}; use crate::core::entities::{Dict, DictEntry};
use crate::core::errors::RepositoryError; use crate::core::errors::RepositoryError;
use crate::core::traits::DictRepository; use crate::core::traits::DictRepository;
use futures::stream::BoxStream;
use futures::{StreamExt, TryStreamExt};
use sqlx::{Row, SqlitePool, sqlite::SqliteConnectOptions}; use sqlx::{Row, SqlitePool, sqlite::SqliteConnectOptions};
use std::collections::HashMap; use std::collections::HashMap;
use std::str::FromStr; use std::str::FromStr;
@ -187,4 +190,42 @@ impl DictRepository for SqliteDictRepository {
entries: entries_map, entries: entries_map,
}) })
} }
async fn stream_batches(
&self,
batch_size: usize,
) -> Result<BoxStream<'_, Result<Vec<String>, RepositoryError>>, RepositoryError> {
// 1. Resolve ID first
let dict_id = self.get_dict_id().await?;
// 2. Create the base query stream.
// We do NOT use limit/offset. We let the DB stream rows via a cursor.
let query_stream = sqlx::query("SELECT text FROM entries WHERE dictionary_id = ?")
.bind(dict_id)
.fetch(&self.pool);
// 3. Transform the stream using Functional combinators
let stream = query_stream
// Map SQLx errors to Domain errors
.map_err(|e| RepositoryError::StorageError(e.to_string()))
// Extract the String from the Row
.and_then(|row| async move {
// 'text' is the column name
let text: String = row
.try_get("text")
.map_err(|e| RepositoryError::StorageError(e.to_string()))?;
Ok(text)
})
// Group items into vectors of size `batch_size`
.try_chunks(batch_size)
// try_chunks returns a specific error type on failure, map it back
.map_err(|e| {
// logic to handle leftover elements if error occurs,
// but for simplicity, we treat stream errors as fatal here
RepositoryError::StorageError(e.to_string())
});
// 4. Box the stream to erase the complex iterator type (Type Erasure)
Ok(Box::pin(stream))
}
} }

3
lib/src/lib.rs

@ -4,6 +4,7 @@ pub mod infrastructure;
mod presentation; mod presentation;
pub use self::application::config; pub use self::application::config;
pub use self::application::services::DictImporter;
pub use self::core::system; pub use self::core::system;
pub use self::core::traits::SystemDecoder; pub use self::core::traits;
pub use self::presentation::cli; pub use self::presentation::cli;

12
lib/src/presentation/cli/cli_args.rs

@ -30,6 +30,9 @@ pub enum Command {
/// Decode a word using given system /// Decode a word using given system
Decode(DecodeArgs), Decode(DecodeArgs),
/// Encode a number using given system
Encode(EncodeArgs),
/// Import dictionary /// Import dictionary
ImportDict(ImportDictArgs), ImportDict(ImportDictArgs),
} }
@ -42,6 +45,15 @@ pub struct ServerArgs {
#[derive(ClapArgs, Debug, Clone)] #[derive(ClapArgs, Debug, Clone)]
pub struct DecodeArgs { pub struct DecodeArgs {
#[arg(long, help = defaults::HELP_DEC_SYSTEM)]
pub system: Option<String>,
#[arg(long, help = defaults::HELP_DEC_INPUT)]
pub input: String,
}
#[derive(ClapArgs, Debug, Clone)]
pub struct EncodeArgs {
#[arg(long, help = defaults::HELP_ENC_SYSTEM)] #[arg(long, help = defaults::HELP_ENC_SYSTEM)]
pub system: Option<String>, pub system: Option<String>,

1
lib/src/presentation/cli/commands.rs

@ -1,3 +1,4 @@
pub mod decode; pub mod decode;
pub mod encode;
pub mod import_dict; pub mod import_dict;
pub mod server; pub mod server;

2
lib/src/presentation/cli/commands/decode.rs

@ -3,7 +3,7 @@ use crate::core::system;
use tracing::debug; use tracing::debug;
pub async fn run(config: DecoderConfig) { pub async fn run(config: DecoderConfig) {
debug!("Running greeter with config {:?}", config); debug!("Running decoder with config {:?}", config);
let decoder = system::create_decoder(&config.system); let decoder = system::create_decoder(&config.system);
let result = decoder.decode(&config.input).unwrap(); let result = decoder.decode(&config.input).unwrap();
println!("{}", result.as_str()); println!("{}", result.as_str());

10
lib/src/presentation/cli/commands/encode.rs

@ -0,0 +1,10 @@
use crate::application::config::EncoderConfig;
use crate::core::{DictRepository, system};
use tracing::debug;
pub async fn run(config: EncoderConfig, dict: &dyn DictRepository) {
debug!("Running encoder with config {:?}", config);
let encoder = system::create_encoder(&config.system, dict).await;
let result = encoder.encode(&config.input);
println!("{:?}", result);
}

9
lib/src/presentation/cli/commands/import_dict.rs

@ -1,12 +1,8 @@
use crate::application::{config::ImportDictConfig, services::DictImporter}; use crate::application::{config::ImportDictConfig, services::DictImporter};
use crate::core::traits::DictRepository;
use crate::infrastructure::json_file_dict_source::JsonFileDictSource; use crate::infrastructure::json_file_dict_source::JsonFileDictSource;
use tracing::{debug, error, info}; use tracing::{debug, error, info};
pub async fn run<R: DictRepository>( pub async fn run(config: ImportDictConfig, importer: DictImporter) -> Result<(), anyhow::Error> {
config: ImportDictConfig,
repository: R,
) -> Result<(), anyhow::Error> {
debug!("Importing dict with config {:?}", config); debug!("Importing dict with config {:?}", config);
info!( info!(
@ -17,9 +13,6 @@ pub async fn run<R: DictRepository>(
// Create the JSON file source (will auto-generate IDs starting from 1) // Create the JSON file source (will auto-generate IDs starting from 1)
let source = JsonFileDictSource::new(&config.path)?; let source = JsonFileDictSource::new(&config.path)?;
// Create the importer
let importer = DictImporter::new(&repository);
// Perform the import (this will call create() first) // Perform the import (this will call create() first)
match importer.import(source).await { match importer.import(source).await {
Ok(()) => { Ok(()) => {

4
lib/src/presentation/cli/defaults.rs

@ -10,8 +10,10 @@ pub const SYSTEM_NAME: &str = "major_pl";
pub const HELP_PORT: &str = formatcp!("Override Port [default: {}]", PORT); pub const HELP_PORT: &str = formatcp!("Override Port [default: {}]", PORT);
pub const HELP_LOG: &str = formatcp!("Override Log Level [default: {}]", LOG_LEVEL); pub const HELP_LOG: &str = formatcp!("Override Log Level [default: {}]", LOG_LEVEL);
pub const HELP_DEC_SYSTEM: &str = formatcp!("System to use [default: {}]", SYSTEM_NAME);
pub const HELP_DEC_INPUT: &str = formatcp!("Text to decode");
pub const HELP_ENC_SYSTEM: &str = formatcp!("System to use [default: {}]", SYSTEM_NAME); pub const HELP_ENC_SYSTEM: &str = formatcp!("System to use [default: {}]", SYSTEM_NAME);
pub const HELP_ENC_INPUT: &str = formatcp!("Text to decode"); pub const HELP_ENC_INPUT: &str = formatcp!("Number to encode");
pub const HELP_IMPORT_DICT_NAME: &str = formatcp!("Dictionary name"); pub const HELP_IMPORT_DICT_NAME: &str = formatcp!("Dictionary name");
pub const HELP_IMPORT_DICT_INPUT: &str = formatcp!("Dictionary file path"); pub const HELP_IMPORT_DICT_INPUT: &str = formatcp!("Dictionary file path");

Loading…
Cancel
Save