From 239e26b08cd0365ce55c77cf3e2381d15f068ee4 Mon Sep 17 00:00:00 2001 From: chodak166 Date: Fri, 9 Jan 2026 18:41:07 +0100 Subject: [PATCH] WIP: refactor --- app/src/commands/decode.rs | 15 +- app/src/commands/encode.rs | 17 +- app/src/commands/import_dict.rs | 9 +- app/src/commands/server.rs | 23 +- app/src/config.rs | 56 ++- app/src/container.rs | 4 +- lib/migrations/001_initial.sql | 8 - lib/migrations/002_add_updated_at.sql | 6 - lib/src/common.rs | 7 + lib/src/{core => common}/entities.rs | 0 lib/src/{core => common}/errors.rs | 0 lib/src/common/traits.rs | 41 ++ lib/src/dictionary.rs | 6 + lib/src/dictionary/dict_importer.rs | 59 +++ lib/src/dictionary/infrastructure.rs | 2 + .../infrastructure/json_file_dict_source.rs | 85 +++++ .../infrastructure/sqlite_dict_repository.rs | 231 ++++++++++++ lib/src/lib.rs | 28 +- lib/src/presentation/cli.rs | 9 - lib/src/presentation/server.rs | 20 - lib/src/sys_major.rs | 12 + lib/src/sys_major/decoder.rs | 122 ++++++ lib/src/sys_major/decoder_tests.rs | 134 +++++++ lib/src/sys_major/encoder.rs | 179 +++++++++ lib/src/sys_major/lvmap.rs | 351 ++++++++++++++++++ .../removeme.old}/application.rs | 0 .../removeme.old}/application/config.rs | 0 .../removeme.old}/application/errors.rs | 0 .../removeme.old}/application/services.rs | 0 .../removeme.old}/application/traits.rs | 0 lib/src/{ => sys_major/removeme.old}/core.rs | 0 .../sys_major/removeme.old/core/entities.rs | 146 ++++++++ lib/src/sys_major/removeme.old/core/errors.rs | 31 ++ .../removeme.old}/core/sys_major.rs | 0 .../removeme.old}/core/sys_major/decoder.rs | 0 .../core/sys_major/decoder_tests.rs | 0 .../removeme.old}/core/sys_major/encoder.rs | 0 .../removeme.old}/core/sys_major/lvmap.rs | 0 .../removeme.old}/core/sys_major/rules_en.rs | 0 .../removeme.old}/core/sys_major/rules_pl.rs | 0 .../removeme.old}/core/system.rs | 0 .../removeme.old}/core/traits.rs | 0 .../removeme.old}/infrastructure.rs | 0 .../removeme.old}/infrastructure/errors.rs | 0 
.../infrastructure/json_file_dict_source.rs | 0 .../infrastructure/sqlite_dict_repository.rs | 0 .../removeme.old}/presentation.rs | 0 lib/src/sys_major/rules_en.rs | 15 + lib/src/sys_major/rules_pl.rs | 222 +++++++++++ 49 files changed, 1767 insertions(+), 71 deletions(-) delete mode 100644 lib/migrations/001_initial.sql delete mode 100644 lib/migrations/002_add_updated_at.sql create mode 100644 lib/src/common.rs rename lib/src/{core => common}/entities.rs (100%) rename lib/src/{core => common}/errors.rs (100%) create mode 100644 lib/src/common/traits.rs create mode 100644 lib/src/dictionary.rs create mode 100644 lib/src/dictionary/dict_importer.rs create mode 100644 lib/src/dictionary/infrastructure.rs create mode 100644 lib/src/dictionary/infrastructure/json_file_dict_source.rs create mode 100644 lib/src/dictionary/infrastructure/sqlite_dict_repository.rs delete mode 100644 lib/src/presentation/cli.rs delete mode 100644 lib/src/presentation/server.rs create mode 100644 lib/src/sys_major.rs create mode 100644 lib/src/sys_major/decoder.rs create mode 100644 lib/src/sys_major/decoder_tests.rs create mode 100644 lib/src/sys_major/encoder.rs create mode 100644 lib/src/sys_major/lvmap.rs rename lib/src/{ => sys_major/removeme.old}/application.rs (100%) rename lib/src/{ => sys_major/removeme.old}/application/config.rs (100%) rename lib/src/{ => sys_major/removeme.old}/application/errors.rs (100%) rename lib/src/{ => sys_major/removeme.old}/application/services.rs (100%) rename lib/src/{ => sys_major/removeme.old}/application/traits.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core.rs (100%) create mode 100644 lib/src/sys_major/removeme.old/core/entities.rs create mode 100644 lib/src/sys_major/removeme.old/core/errors.rs rename lib/src/{ => sys_major/removeme.old}/core/sys_major.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/sys_major/decoder.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/sys_major/decoder_tests.rs (100%) rename 
lib/src/{ => sys_major/removeme.old}/core/sys_major/encoder.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/sys_major/lvmap.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/sys_major/rules_en.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/sys_major/rules_pl.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/system.rs (100%) rename lib/src/{ => sys_major/removeme.old}/core/traits.rs (100%) rename lib/src/{ => sys_major/removeme.old}/infrastructure.rs (100%) rename lib/src/{ => sys_major/removeme.old}/infrastructure/errors.rs (100%) rename lib/src/{ => sys_major/removeme.old}/infrastructure/json_file_dict_source.rs (100%) rename lib/src/{ => sys_major/removeme.old}/infrastructure/sqlite_dict_repository.rs (100%) rename lib/src/{ => sys_major/removeme.old}/presentation.rs (100%) create mode 100644 lib/src/sys_major/rules_en.rs create mode 100644 lib/src/sys_major/rules_pl.rs diff --git a/app/src/commands/decode.rs b/app/src/commands/decode.rs index aae23b0..cbfd170 100644 --- a/app/src/commands/decode.rs +++ b/app/src/commands/decode.rs @@ -1,15 +1,16 @@ use crate::commands::{ClapArgs, CommandExecutor, ConfigurableCommand}; use crate::config::AppConfig; +use crate::config::System; use crate::container::Container; use anyhow::Result; -use applib::core::sys_major::decoder::Decoder; -use applib::core::sys_major::{self as major}; -use applib::core::traits::SystemDecoder; -use applib::system::System; +use applib::SystemDecoder; +use applib::sys_major::decoder::Decoder; +use applib::sys_major::{self as major}; use async_trait::async_trait; use config::ConfigBuilder; use config::builder::DefaultState; +use serde::Deserialize; mod defaults { use const_format::formatcp; @@ -18,6 +19,12 @@ mod defaults { pub const HELP_DEC_INPUT: &str = formatcp!("Text to decode"); } +#[derive(Debug, Deserialize, Clone)] +pub struct Config { + pub system: System, + pub input: String, +} + #[derive(ClapArgs, Debug, Clone)] pub struct DecodeArgs { 
#[arg(long, help = defaults::HELP_DEC_SYSTEM)] diff --git a/app/src/commands/encode.rs b/app/src/commands/encode.rs index a5b9f54..5dd8aed 100644 --- a/app/src/commands/encode.rs +++ b/app/src/commands/encode.rs @@ -1,11 +1,12 @@ -use applib::core::sys_major::encoder::Encoder; -use applib::core::sys_major::{self as major, LenValueMap}; -use applib::core::traits::SystemEncoder; -use applib::system::System; +use applib::SystemEncoder; +use applib::sys_major::encoder::Encoder; +use applib::sys_major::{self as major, LenValueMap}; + +use serde::Deserialize; use tracing::debug; use crate::commands::{ClapArgs, CommandExecutor, ConfigurableCommand}; -use crate::config::AppConfig; +use crate::config::{AppConfig, System}; use crate::container::Container; use anyhow::Result; @@ -20,6 +21,12 @@ mod defaults { pub const HELP_ENC_INPUT: &str = formatcp!("Number to encode"); } +#[derive(Debug, Deserialize, Clone)] +pub struct Config { + pub system: System, + pub input: String, +} + #[derive(ClapArgs, Debug, Clone)] pub struct EncodeArgs { #[arg(long, help = defaults::HELP_ENC_SYSTEM)] diff --git a/app/src/commands/import_dict.rs b/app/src/commands/import_dict.rs index 32d1043..2532acf 100644 --- a/app/src/commands/import_dict.rs +++ b/app/src/commands/import_dict.rs @@ -6,6 +6,13 @@ use anyhow::Result; use async_trait::async_trait; use config::ConfigBuilder; use config::builder::DefaultState; +use serde::Deserialize; + +#[derive(Debug, Deserialize, Clone)] +pub struct Config { + pub name: String, + pub path: String, +} #[derive(ClapArgs, Debug, Clone)] pub struct ImportDictArgs { @@ -49,7 +56,7 @@ impl CommandExecutor for ImportDictArgs { // Importer expects an impl DictSource // We need to create a DictSource from the path - use applib::infrastructure::json_file_dict_source::JsonFileDictSource; + use applib::JsonFileDictSource; let source = JsonFileDictSource::new(&config.path)?; importer.import(source).await?; Ok(()) diff --git a/app/src/commands/server.rs 
b/app/src/commands/server.rs index d561be1..a538b5e 100644 --- a/app/src/commands/server.rs +++ b/app/src/commands/server.rs @@ -7,9 +7,15 @@ use anyhow::Result; use async_trait::async_trait; use config::ConfigBuilder; use config::builder::DefaultState; +use serde::Deserialize; use tokio::signal; use tracing::{info, warn}; +#[derive(Debug, Deserialize, Clone)] +pub struct Config { + pub port: u16, +} + #[derive(ClapArgs, Debug, Clone)] pub struct ServerArgs { #[arg(short, long, help = defaults::HELP_PORT)] @@ -43,7 +49,15 @@ impl ConfigurableCommand for ServerArgs { impl CommandExecutor for ServerArgs { async fn execute(&self, config: &AppConfig, _container: &Container) -> Result<()> { let config = config.server.as_ref().expect("Server config not set"); - applib::presentation::server::run(config.clone(), wait_for_shutdown_signal()).await; + + info!("Running server with config: {:#?}", config); + tokio::select! { + _ = server_loop() => {}, + _ = wait_for_shutdown_signal() => { + info!("Shutting down server..."); + } + } + Ok(()) } } @@ -58,3 +72,10 @@ async fn wait_for_shutdown_signal() { } } } + +async fn server_loop() { + loop { + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + info!("Health check... 
"); + } +} diff --git a/app/src/config.rs b/app/src/config.rs index 34dac32..8feff24 100644 --- a/app/src/config.rs +++ b/app/src/config.rs @@ -1,19 +1,19 @@ -use crate::commands::{ConfigurableCommand, GlobalArgs}; +use crate::commands::*; +// use crate::commands::{ConfigurableCommand, GlobalArgs}; use anyhow::{Context, Result}; -use applib::application::config::*; use config::{Config, Environment, File}; use serde::Deserialize; #[derive(Debug, Deserialize, Clone)] pub struct AppConfig { #[serde(default)] - pub server: Option, + pub server: Option, #[serde(default)] - pub decoder: Option, + pub decoder: Option, #[serde(default)] - pub encoder: Option, + pub encoder: Option, #[serde(default)] - pub import_dict: Option, + pub import_dict: Option, pub log_level: String, } @@ -48,3 +48,47 @@ impl AppConfig { .context("Failed to deserialize Config") } } + +// TODO: move? +use applib::sys_major::{self as major, LenValueMap}; +use applib::{DictRepository, SystemDecoder, SystemEncoder}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] +pub enum System { + #[serde(rename = "major_en")] + MajorEn, + #[serde(rename = "major_pl")] + MajorPl, +} + +// from: +impl From<&str> for System { + fn from(s: &str) -> Self { + match s { + "major_en" => System::MajorEn, + "major_pl" => System::MajorPl, + _ => panic!("Unknown system: {}", s), + } + } +} + +pub fn create_decoder(system: &System) -> Box { + match system { + System::MajorPl => Box::new(major::Decoder::new(major::rules_pl::get_rules())), + System::MajorEn => Box::new(major::Decoder::new(major::rules_en::get_rules())), + } +} + +pub async fn create_encoder(system: &System, dict: &dyn DictRepository) -> Box { + // let decoder = create_decoder(&system); + let decoder = major::Decoder::new(match system { + System::MajorPl => major::rules_pl::get_rules(), + System::MajorEn => major::rules_en::get_rules(), + }); + let stream = dict.stream_batches(100).await.unwrap(); // TODO + let lvmap = LenValueMap::from_stream(stream, 
&decoder).await.unwrap(); // TODO + match system { + System::MajorPl => Box::new(major::Encoder::new(lvmap)), + System::MajorEn => Box::new(major::Encoder::new(lvmap)), + } +} diff --git a/app/src/container.rs b/app/src/container.rs index e53a646..f67bb96 100644 --- a/app/src/container.rs +++ b/app/src/container.rs @@ -2,8 +2,8 @@ use std::sync::Arc; // use crate::config::AppConfig; use applib::DictImporter; -use applib::infrastructure::sqlite_dict_repository::SqliteDictRepository; -use applib::traits::DictRepository; +use applib::DictRepository; +use applib::SqliteDictRepository; #[derive(Clone)] pub struct Container; diff --git a/lib/migrations/001_initial.sql b/lib/migrations/001_initial.sql deleted file mode 100644 index 35eb84c..0000000 --- a/lib/migrations/001_initial.sql +++ /dev/null @@ -1,8 +0,0 @@ --- Create dicts table -CREATE TABLE IF NOT EXISTS dicts ( - name TEXT PRIMARY KEY, - created_at DATETIME DEFAULT CURRENT_TIMESTAMP -); - --- Note: Individual dict entry tables will be created dynamically when dicts are created --- This is because each dict needs its own table with the dict name in the table name \ No newline at end of file diff --git a/lib/migrations/002_add_updated_at.sql b/lib/migrations/002_add_updated_at.sql deleted file mode 100644 index 9d1bce5..0000000 --- a/lib/migrations/002_add_updated_at.sql +++ /dev/null @@ -1,6 +0,0 @@ --- This migration adds the updated_at column to existing dict entry tables --- Since dict entry tables are created dynamically, we need to handle this differently --- For now, we'll drop and recreate existing tables to ensure they have the correct schema - --- Note: In a production environment, you would want to handle this more carefully --- by checking each existing dict_entries_* table and adding the column if it doesn't exist \ No newline at end of file diff --git a/lib/src/common.rs b/lib/src/common.rs new file mode 100644 index 0000000..b7854fc --- /dev/null +++ b/lib/src/common.rs @@ -0,0 +1,7 @@ +pub mod 
entities; +pub mod errors; +pub mod traits; + +pub use self::traits::DictRepository; +pub use self::traits::SystemDecoder; +pub use self::traits::SystemEncoder; diff --git a/lib/src/core/entities.rs b/lib/src/common/entities.rs similarity index 100% rename from lib/src/core/entities.rs rename to lib/src/common/entities.rs diff --git a/lib/src/core/errors.rs b/lib/src/common/errors.rs similarity index 100% rename from lib/src/core/errors.rs rename to lib/src/common/errors.rs diff --git a/lib/src/common/traits.rs b/lib/src/common/traits.rs new file mode 100644 index 0000000..5322dee --- /dev/null +++ b/lib/src/common/traits.rs @@ -0,0 +1,41 @@ +use futures::stream::BoxStream; + +use crate::common::{ + entities::{DecodedValue, Dict, DictEntry, EncodedValue}, + errors::{CodecError, RepositoryError}, +}; + +pub trait SystemDecoder: Send + Sync { + fn decode(&self, word: &str) -> Result; +} + +pub trait SystemEncoder: Send + Sync { + fn initialize(&self) -> Result<(), CodecError>; + fn encode(&self, word: &str) -> Result; +} + +#[async_trait::async_trait] +pub trait DictRepository: Send + Sync { + fn use_dict(&mut self, name: &str); + async fn create_dict(&self) -> Result<(), RepositoryError>; + + /// "Upsert" logic: + /// - If entry exists (by text), update metadata. + /// - If not, insert new. + /// - IDs are handled by the Database. + async fn save_entries(&self, entries: &[DictEntry]) -> Result<(), RepositoryError>; + + /// Fetch a page of entries. + async fn fetch_many(&self, limit: usize, offset: usize) -> Result; + + /// Returns a cold stream that fetches strings in chunks. + /// The stream yields `Result, RepositoryError>`. 
+ async fn stream_batches( + &self, + batch_size: usize, + ) -> Result, RepositoryError>>, RepositoryError>; +} + +pub trait DictSource { + fn next_entry(&mut self) -> Option>; +} diff --git a/lib/src/dictionary.rs b/lib/src/dictionary.rs new file mode 100644 index 0000000..fedb896 --- /dev/null +++ b/lib/src/dictionary.rs @@ -0,0 +1,6 @@ +mod dict_importer; +mod infrastructure; + +pub use self::dict_importer::DictImporter; +pub use self::infrastructure::json_file_dict_source::JsonFileDictSource; +pub use self::infrastructure::sqlite_dict_repository::SqliteDictRepository; diff --git a/lib/src/dictionary/dict_importer.rs b/lib/src/dictionary/dict_importer.rs new file mode 100644 index 0000000..5bec64b --- /dev/null +++ b/lib/src/dictionary/dict_importer.rs @@ -0,0 +1,59 @@ +use std::sync::Arc; + +use crate::common::traits::{DictRepository, DictSource}; + +pub struct DictImporter { + repo: Arc, + batch_size: usize, +} + +impl DictImporter { + pub fn new(repo: Arc) -> Self { + Self { + repo, + batch_size: 1000, // reasonable default + } + } + + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = batch_size; + self + } + + pub async fn import(&self, mut source: impl DictSource) -> Result<(), anyhow::Error> { + // 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?) + self.repo.create_dict().await?; + + let mut batch = Vec::with_capacity(self.batch_size); + + // 2. Stream data + while let Some(result) = source.next_entry() { + match result { + Ok(entry) => { + // Optional: Domain Validation logic could go here + // if entry.text.is_empty() { continue; } + + batch.push(entry); + + // 3. Batch Write + if batch.len() >= self.batch_size { + self.repo.save_entries(&batch).await?; + batch.clear(); + } + } + Err(e) => { + // Logic: Do we abort on malformed JSON or log and continue? + // Here we abort for safety. + return Err(e); + } + } + } + + // 4. 
Flush remaining + if !batch.is_empty() { + self.repo.save_entries(&batch).await?; + } + + Ok(()) + } +} diff --git a/lib/src/dictionary/infrastructure.rs b/lib/src/dictionary/infrastructure.rs new file mode 100644 index 0000000..cbff865 --- /dev/null +++ b/lib/src/dictionary/infrastructure.rs @@ -0,0 +1,2 @@ +pub mod json_file_dict_source; +pub mod sqlite_dict_repository; diff --git a/lib/src/dictionary/infrastructure/json_file_dict_source.rs b/lib/src/dictionary/infrastructure/json_file_dict_source.rs new file mode 100644 index 0000000..b7b3dcd --- /dev/null +++ b/lib/src/dictionary/infrastructure/json_file_dict_source.rs @@ -0,0 +1,85 @@ +use crate::common::entities::DictEntry; +use crate::common::traits::DictSource; +use serde::Deserialize; +use std::collections::HashMap; +use std::fs::File; +use std::io::BufReader; +use std::path::Path; + +// The "Wire Format". +// It exists ONLY here to map external JSON names to internal Entity names. +#[derive(Deserialize)] +struct JsonEntry { + word: String, + metadata: Option>, +} + +pub struct JsonFileDictSource { + entries: Vec, + current_index: usize, + next_id: u32, +} + +impl JsonFileDictSource { + pub fn new>(path: P) -> anyhow::Result { + let file = File::open(path)?; + let reader = BufReader::new(file); + + // Parse as JSON array + let json_entries: Vec = serde_json::from_reader(reader)?; + + // Convert to DictEntry with auto-generated IDs + let mut entries = Vec::new(); + for (index, json_entry) in json_entries.into_iter().enumerate() { + let id = (index + 1) as u64; // Auto-generate ID starting from 1 + + // Convert metadata from serde_json::Value to HashMap + let metadata = if let Some(meta) = json_entry.metadata { + meta.into_iter() + .map(|(k, v)| { + ( + k, + match v { + serde_json::Value::String(s) => s, + _ => v.to_string(), + }, + ) + }) + .collect() + } else { + HashMap::new() + }; + + entries.push(DictEntry { + id: Some(id), + text: json_entry.word, + metadata, + }); + } + + let entries_len = 
entries.len(); + Ok(Self { + entries, + current_index: 0, + next_id: (entries_len + 1) as u32, + }) + } + + pub fn new_with_existing_ids>(path: P, start_id: u32) -> anyhow::Result { + let mut source = Self::new(path)?; + source.next_id = start_id; + Ok(source) + } +} + +impl DictSource for JsonFileDictSource { + fn next_entry(&mut self) -> Option> { + if self.current_index < self.entries.len() { + let entry = self.entries[self.current_index].clone(); + self.current_index += 1; + Some(Ok(entry)) + } else { + None + } + } +} diff --git a/lib/src/dictionary/infrastructure/sqlite_dict_repository.rs b/lib/src/dictionary/infrastructure/sqlite_dict_repository.rs new file mode 100644 index 0000000..48f0e93 --- /dev/null +++ b/lib/src/dictionary/infrastructure/sqlite_dict_repository.rs @@ -0,0 +1,231 @@ +use crate::common::entities::{Dict, DictEntry}; +use crate::common::errors::RepositoryError; +use crate::common::traits::DictRepository; +use futures::TryStreamExt; +use futures::stream::BoxStream; + +use sqlx::{Row, SqlitePool, sqlite::SqliteConnectOptions}; +use std::collections::HashMap; +use std::str::FromStr; + +#[derive(sqlx::FromRow)] +struct SqliteEntryDto { + id: i64, + text: String, + // sqlx reads the DB column into this specific wrapper + metadata: sqlx::types::Json>, +} + +// Mapper: DTO -> Domain Entity +impl From for DictEntry { + fn from(dto: SqliteEntryDto) -> Self { + Self { + id: Some(dto.id as u64), + text: dto.text, + // Unwrap the sqlx wrapper to get the inner HashMap + metadata: dto.metadata.0, + } + } +} + +// --- REPOSITORY IMPLEMENTATION --- + +#[derive(Clone)] +pub struct SqliteDictRepository { + pool: SqlitePool, + dict_name: String, +} + +impl SqliteDictRepository { + pub async fn new(database_url: &str) -> Result { + let options = SqliteConnectOptions::from_str(database_url) + .map_err(|_| RepositoryError::ConnectionFailed)? 
+ .create_if_missing(true); + + let pool = SqlitePool::connect_with(options) + .await + .map_err(|_| RepositoryError::ConnectionFailed)?; + + // Ensure tables exist with proper Normalization and Constraints + sqlx::query( + r#" + CREATE TABLE IF NOT EXISTS dictionaries ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS entries ( + id INTEGER PRIMARY KEY, + dictionary_id INTEGER NOT NULL, + text TEXT NOT NULL, + metadata TEXT, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(dictionary_id) REFERENCES dictionaries(id) ON DELETE CASCADE, + -- This constraint allows us to update existing words instead of duplicating them + UNIQUE(dictionary_id, text) + ); + "#, + ) + .execute(&pool) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + + Ok(Self { + pool: pool, + dict_name: "default_dict".into(), + }) + } + + // Helper: Resolve dictionary name to ID + async fn get_dict_id(&self) -> Result { + let row = sqlx::query("SELECT id FROM dictionaries WHERE name = ?") + .bind(&self.dict_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + + match row { + Some(r) => Ok(r.get("id")), + None => Err(RepositoryError::NotFound(self.dict_name.clone())), + } + } +} + +#[async_trait::async_trait] +impl DictRepository for SqliteDictRepository { + async fn create_dict(&self) -> Result<(), RepositoryError> { + sqlx::query("INSERT OR IGNORE INTO dictionaries (name) VALUES (?)") + .bind(&self.dict_name) + .execute(&self.pool) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + Ok(()) + } + + fn use_dict(&mut self, name: &str) { + self.dict_name = name.to_string(); + } + + async fn save_entries(&self, entries: &[DictEntry]) -> Result<(), RepositoryError> { + let mut tx = self + .pool + .begin() + .await + .map_err(|_| RepositoryError::ConnectionFailed)?; + + // 1. 
Get Dict ID + let dict_id_row = sqlx::query("SELECT id FROM dictionaries WHERE name = ?") + .bind(&self.dict_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + + let dict_id: i64 = match dict_id_row { + Some(row) => row.get("id"), + None => return Err(RepositoryError::NotFound(self.dict_name.clone())), + }; + + // 2. Batch Upsert + for entry in entries { + // We must wrap the HashMap in sqlx::types::Json so SQLx knows how to serialize it + let meta_json = sqlx::types::Json(&entry.metadata); + + sqlx::query( + r#" + INSERT INTO entries (dictionary_id, text, metadata) + VALUES (?, ?, ?) + ON CONFLICT(dictionary_id, text) DO UPDATE SET + metadata = excluded.metadata, + updated_at = CURRENT_TIMESTAMP + "#, + ) + .bind(dict_id) + .bind(&entry.text) + .bind(meta_json) + .execute(&mut *tx) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + } + + tx.commit() + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + Ok(()) + } + + async fn fetch_many(&self, limit: usize, offset: usize) -> Result { + // Get Dict ID + let dict_id = self.get_dict_id().await?; + + // Query (Reading into the DTO) + let dtos = sqlx::query_as::<_, SqliteEntryDto>( + r#" + SELECT id, text, metadata + FROM entries + WHERE dictionary_id = ? + LIMIT ? OFFSET ? + "#, + ) + .bind(dict_id) + .bind(limit as u32) + .bind(offset as u32) + .fetch_all(&self.pool) + .await + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + + // 4. 
Convert DTOs to Domain Dict + let mut entries_map = HashMap::new(); + for dto in dtos { + let entry: DictEntry = dto.into(); // Converts DTO -> Entity + + // We safely unwrap because the DB guarantees an ID exists + if let Some(id) = entry.id { + entries_map.insert(id, entry); + } + } + + Ok(Dict { + name: self.dict_name.clone(), + entries: entries_map, + }) + } + + async fn stream_batches( + &self, + batch_size: usize, + ) -> Result, RepositoryError>>, RepositoryError> { + // 1. Resolve ID first + let dict_id = self.get_dict_id().await?; + + // 2. Create the base query stream. + // We do NOT use limit/offset. We let the DB stream rows via a cursor. + let query_stream = sqlx::query("SELECT text FROM entries WHERE dictionary_id = ?") + .bind(dict_id) + .fetch(&self.pool); + + // 3. Transform the stream using Functional combinators + let stream = query_stream + // Map SQLx errors to Domain errors + .map_err(|e| RepositoryError::StorageError(e.to_string())) + // Extract the String from the Row + .and_then(|row| async move { + // 'text' is the column name + let text: String = row + .try_get("text") + .map_err(|e| RepositoryError::StorageError(e.to_string()))?; + Ok(text) + }) + // Group items into vectors of size `batch_size` + .try_chunks(batch_size) + // try_chunks returns a specific error type on failure, map it back + .map_err(|e| { + // logic to handle leftover elements if error occurs, + // but for simplicity, we treat stream errors as fatal here + RepositoryError::StorageError(e.to_string()) + }); + + // 4. 
Box the stream to erase the complex iterator type (Type Erasure) + Ok(Box::pin(stream)) + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 6f98201..33fc17d 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -1,10 +1,20 @@ -pub mod application; -pub mod core; -pub mod infrastructure; -pub mod presentation; +// pub mod application; +// pub mod core; +// pub mod infrastructure; +// pub mod presentation; -pub use self::application::config; -pub use self::application::services::DictImporter; -pub use self::core::system; -pub use self::core::traits; -// pub use self::presentation::cli; // Removed as we are deleting it +// pub use self::application::config; +// pub use self::application::services::DictImporter; +// pub use self::core::system; +// pub use self::core::traits; + +mod common; +mod dictionary; +pub mod sys_major; + +pub use self::common::DictRepository; +pub use self::common::SystemDecoder; +pub use self::common::SystemEncoder; +pub use self::dictionary::DictImporter; +pub use self::dictionary::JsonFileDictSource; +pub use self::dictionary::SqliteDictRepository; diff --git a/lib/src/presentation/cli.rs b/lib/src/presentation/cli.rs deleted file mode 100644 index f89bbcd..0000000 --- a/lib/src/presentation/cli.rs +++ /dev/null @@ -1,9 +0,0 @@ -pub mod cli_args; -pub mod commands; -pub mod defaults; -pub mod traits; - -pub use self::cli_args::{ - CliArgs, Command, DecodeArgs, EncodeArgs, GlobalArgs, ImportDictArgs, ServerArgs, -}; -pub use self::traits::ConfigurableCommand; diff --git a/lib/src/presentation/server.rs b/lib/src/presentation/server.rs deleted file mode 100644 index 7878a16..0000000 --- a/lib/src/presentation/server.rs +++ /dev/null @@ -1,20 +0,0 @@ -use crate::application::config::ServerConfig; -use tracing::info; - -pub async fn run(config: ServerConfig, blocker: impl std::future::Future) { - info!("Running server with config: {:#?}", config); - - tokio::select! 
{ - _ = server_loop() => {}, - _ = blocker => { - info!("Shutting down server..."); - } - } -} - -async fn server_loop() { - loop { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; - info!("Health check... "); - } -} diff --git a/lib/src/sys_major.rs b/lib/src/sys_major.rs new file mode 100644 index 0000000..b2df791 --- /dev/null +++ b/lib/src/sys_major.rs @@ -0,0 +1,12 @@ +pub mod decoder; +pub mod encoder; +pub mod lvmap; +pub mod rules_en; // TODO: pub? +pub mod rules_pl; // TODO: pub? + +pub use self::decoder::Decoder; +pub use self::encoder::Encoder; +pub use self::lvmap::LenValueMap; // TODO: pub? + +#[cfg(test)] +mod decoder_tests; diff --git a/lib/src/sys_major/decoder.rs b/lib/src/sys_major/decoder.rs new file mode 100644 index 0000000..2324979 --- /dev/null +++ b/lib/src/sys_major/decoder.rs @@ -0,0 +1,122 @@ +use crate::common::{entities::DecodedValue, errors::CodecError, traits::SystemDecoder}; + +#[derive(Debug, Default, Clone)] +pub struct Rule { + pub phoneme_in: String, + pub phoneme_out: String, + + pub not_before: Vec, + pub not_after: Vec, + + pub only_before: Vec, + pub only_after: Vec, +} + +impl Rule { + pub fn into_lowercase(self) -> Self { + Rule { + phoneme_in: self.phoneme_in.to_lowercase(), + phoneme_out: self.phoneme_out.to_lowercase(), + not_before: Self::lower_vec(self.not_before), + not_after: Self::lower_vec(self.not_after), + only_before: Self::lower_vec(self.only_before), + only_after: Self::lower_vec(self.only_after), + } + } + + fn lower_vec(vec: Vec) -> Vec { + vec.into_iter().map(|s| s.to_lowercase()).collect() + } +} + +pub type Rules = Vec; +// pub struct rules { +// name: String, +// entries: Rules, +// } + +/// (index, decoded value) +type RuleMatches = Vec<(usize, String)>; + +pub struct Decoder { + rules: Rules, +} + +impl Decoder { + pub fn new(rules: Rules) -> Self { + Decoder { + rules: Decoder::to_lower_rules(rules), + } + } + + fn to_lower_rules(rules: Rules) -> Rules { + rules + .into_iter() + 
.map(|entry| entry.into_lowercase()) + .collect() + } + + fn match_entry(&self, entry: &Rule, word: &str) -> RuleMatches { + word.match_indices(&entry.phoneme_in) + .filter(|(index, _)| self.is_context_matched(&entry, &word, *index)) + .map(|(index, _)| (index, entry.phoneme_out.clone())) + .collect() + } + + fn is_context_matched(&self, entry: &Rule, word: &str, index: usize) -> bool { + let before_context = &word[..index]; + let after_context = &word[index + entry.phoneme_in.len()..]; + // dbg!(&before_context); + // dbg!(&after_context); + + if entry + .not_after + .iter() + .any(|prefix| before_context.ends_with(prefix)) + { + return false; + } + + if entry + .not_before + .iter() + .any(|suffix| after_context.starts_with(suffix)) + { + return false; + } + + if !entry.only_after.is_empty() + && entry + .only_after + .iter() + .all(|prefix| !before_context.ends_with(prefix)) + { + return false; + } + + if !entry.only_before.is_empty() + && entry + .only_before + .iter() + .all(|suffix| !after_context.starts_with(suffix)) + { + return false; + } + + true + } +} + +impl SystemDecoder for Decoder { + fn decode(&self, word: &str) -> Result { + let mut matches: RuleMatches = self + .rules + .iter() + .flat_map(|entry| self.match_entry(&entry, &word.to_lowercase())) + .collect(); + + matches.sort_by_key(|&(pos, _)| pos); + let num_str: String = matches.into_iter().map(|(_, value)| value).collect(); + DecodedValue::new(num_str) + } +} diff --git a/lib/src/sys_major/decoder_tests.rs b/lib/src/sys_major/decoder_tests.rs new file mode 100644 index 0000000..debc436 --- /dev/null +++ b/lib/src/sys_major/decoder_tests.rs @@ -0,0 +1,134 @@ +use super::decoder::{Decoder, Rule, Rules}; +use crate::common::traits::SystemDecoder; + +#[cfg(test)] +mod tests { + use super::*; + + fn create_single_rules() -> Rules { + vec![Rule { + phoneme_in: "B".to_string(), + phoneme_out: "2".to_string(), + not_after: vec!["Y".to_string()], + not_before: vec!["X".to_string()], + only_after: 
vec!["A".to_string()], + only_before: vec!["C".to_string()], + }] + } + + fn create_single_rules_min() -> Rules { + vec![Rule { + phoneme_in: "B".to_string(), + phoneme_out: "2".to_string(), + ..Default::default() + }] + } + + fn create_double_rules() -> Rules { + vec![ + Rule { + phoneme_in: "CD".to_string(), + phoneme_out: "2".to_string(), + not_after: vec!["00".to_string(), "YZ".to_string()], + not_before: vec!["11".to_string(), "WX".to_string()], + only_after: vec!["22".to_string(), "AB".to_string()], + only_before: vec!["33".to_string(), "EF".to_string()], + }, + Rule { + phoneme_in: "MN".to_string(), + phoneme_out: "3".to_string(), + ..Default::default() + }, + ] + } + + #[test] + fn test_single_symbol_encoding_only_before_only_after_matched() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("ABC").unwrap(); + assert_eq!(output, "2") + } + + #[test] + fn test_double_symbol_encoding_only_before_only_after_matched() { + let decoder = Decoder::new(create_double_rules()); + let output = decoder.decode("ABCDEF").unwrap(); + assert_eq!(output, "2") + } + + #[test] + fn test_single_symbol_encoding_only_before_not_matched_with_other() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("DBC").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_double_symbol_encoding_only_before_not_matched_with_other() { + let decoder = Decoder::new(create_double_rules()); + let output = decoder.decode("AACDEE").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_case_insensitivity() { + let decoder = Decoder::new(create_double_rules()); + let output = decoder.decode("abcdef").unwrap(); + assert_eq!(output, "2") + } + + #[test] + fn test_single_symbol_encoding_only_before_not_matched_with_empty() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("BC").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn 
test_single_symbol_encoding_only_before_not_matched_with_not_before() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("XBC").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_single_symbol_encoding_only_after_not_matched_with_other() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("ABD").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_single_symbol_encoding_only_after_not_matched_with_empty() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("AB").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_single_symbol_encoding_only_after_not_matched_with_not_after() { + let decoder = Decoder::new(create_single_rules()); + let output = decoder.decode("ABY").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_single_symbol_encoding_empty_before_after_matched_with_empty() { + let decoder = Decoder::new(create_single_rules_min()); + let output = decoder.decode("B").unwrap(); + assert_eq!(output, "2") + } + + #[test] + fn test_single_symbol_encoding_empty_before_after_matched_with_others() { + let decoder = Decoder::new(create_single_rules_min()); + let output = decoder.decode("AXBYC").unwrap(); + assert_eq!(output, "2") + } + #[test] + fn test_encoding_multiple_phonemes() { + let decoder = Decoder::new(create_double_rules()); + let output = decoder.decode("VvmNabCd33mn00CD22cdefmn").unwrap(); + assert_eq!(output, "32323") + } +} diff --git a/lib/src/sys_major/encoder.rs b/lib/src/sys_major/encoder.rs new file mode 100644 index 0000000..c90d537 --- /dev/null +++ b/lib/src/sys_major/encoder.rs @@ -0,0 +1,179 @@ +use crate::common::{ + entities::{EncodedPart, EncodedSplit, EncodedValue}, + errors::CodecError, + traits::*, +}; + +use super::lvmap::LenValueMap; + +#[derive(Debug)] +pub struct Encoder { + lv_map: LenValueMap, +} + +impl Encoder { + pub fn new(lv_map: LenValueMap) -> Self { + Encoder { lv_map } + } +} + +impl 
SystemEncoder for Encoder { + fn initialize(&self) -> Result<(), CodecError> { + Ok(()) + } + + fn encode(&self, input: &str) -> Result { + let size = input.chars().count(); + let max_mask: usize = (1 << (size - 1)) - 1; + + let indices: Vec = input.char_indices().map(|(i, _)| i).collect(); + let mut results = Vec::with_capacity(max_mask); + + for mask in 0..=max_mask { + let mut parts: Vec = Vec::new(); + let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts + + // Iterate through the mask bits to find where to split + for i in 0..size - 1 { + // Check if the i-th bit is set + if (mask >> i) & 1 == 1 { + // The split corresponds to the byte index of the (i+1)-th character + let split_idx = indices[indices.len() - i - 1]; + parts.push(input[split_idx..last_split].to_string()); + last_split = split_idx; + } + } + // Push the remaining part of the string + parts.push(input[..last_split].to_string()); + + let mut all_matched = true; + let mut split = EncodedSplit::new(); + parts.reverse(); + + for part in &parts { + let Ok(num_part) = part.parse::() else { + all_matched = false; + break; + }; + let Some(words) = self.lv_map.get(part.len() as u8, num_part) else { + all_matched = false; + break; + }; + split.push(EncodedPart { + value: num_part, + words: words.clone(), + }); + } + + if all_matched { + results.push(Partition { + value: split, + // To find the "most equal" size, we minimize the sum of squared lengths. + // (This mathematically minimizes variance without needing floating point math) + sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(), + }); + } + + // Calculate metrics for sorting + // let num_parts = parts.len(); + + // // To find the "most equal" size, we minimize the sum of squared lengths. 
+ // // (This mathematically minimizes variance without needing floating point math) + // let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); + + // if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) { + // results.push(Partition { + // parts: words.clone(), + // sum_sq_len, + // }); + // } + } + + // Ok(EncodedValue::new(words)) + // Sort by: + // 1. Fewer parts first (1 part, then 2 parts...) + // 2. Most equal lengths (lower sum of squared lengths is more balanced) + // 3. TODO: Lexicographically (for deterministic stability)? + results.sort_by(|a, b| { + a.value + .len() + .cmp(&b.value.len()) + .then(a.sum_sq_len.cmp(&b.sum_sq_len)) + }); + + // Extract just the strings + let split_results = results.into_iter().map(|p| p.value).collect(); + Ok(EncodedValue::new(split_results)) + } +} + +// A helper struct to keep the split variant and its sort metrics together +struct Partition { + value: EncodedSplit, + sum_sq_len: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_as_single_length_result() { + let mut lvmap = LenValueMap::new(); + lvmap.push(3, 123, "test_123"); + lvmap.push(3, 345, "test_345_1"); + lvmap.push(3, 345, "test_345_2"); + lvmap.push(3, 678, "test_678"); + let encoder = Encoder::new(lvmap); + let result = encoder.encode("345").unwrap(); + + assert_eq!(result.len(), 1); // single split + assert_eq!(result[0].len(), 1); // single part + assert_eq!(result[0][0].value, 345); + assert_eq!(result[0][0].words.len(), 2); // two words + assert_eq!(result[0][0].words[0], "test_345_1"); + assert_eq!(result[0][0].words[1], "test_345_2"); + } + + #[test] + fn test_encode_as_all_lengths() { + let mut lvmap = LenValueMap::new(); + lvmap.push(1, 0, "test_0"); + lvmap.push(1, 9, "test_9"); + lvmap.push(1, 8, "test_8"); + lvmap.push(1, 7, "test_7"); + lvmap.push(2, 98, "test_98"); + lvmap.push(2, 87, "test_87"); + lvmap.push(3, 987, "test_987"); + lvmap.push(3, 876, "test_876"); + + 
let encoder = Encoder::new(lvmap); + let result = encoder.encode("987").unwrap(); + + assert_eq!(result.len(), 4); // 987, 98|7, 9|87, 9|8|7 + assert_eq!(result[0].len(), 1); // 987 + + assert_eq!(result[0][0].words.len(), 1); + assert_eq!(result[0][0].words[0], "test_987"); + + assert_eq!(result[1].len(), 2); // 98|7 + assert_eq!(result[1][0].words.len(), 1); + assert_eq!(result[1][0].words[0], "test_98"); + assert_eq!(result[1][1].words.len(), 1); + assert_eq!(result[1][1].words[0], "test_7"); + + assert_eq!(result[2].len(), 2); // 9|87 + assert_eq!(result[2][0].words.len(), 1); + assert_eq!(result[2][0].words[0], "test_9"); + assert_eq!(result[2][1].words.len(), 1); + assert_eq!(result[2][1].words[0], "test_87"); + + assert_eq!(result[3].len(), 3); // 9|8|7 + assert_eq!(result[3][0].words.len(), 1); + assert_eq!(result[3][0].words[0], "test_9"); + assert_eq!(result[3][1].words.len(), 1); + assert_eq!(result[3][1].words[0], "test_8"); + assert_eq!(result[3][2].words.len(), 1); + assert_eq!(result[3][2].words[0], "test_7"); + } +} diff --git a/lib/src/sys_major/lvmap.rs b/lib/src/sys_major/lvmap.rs new file mode 100644 index 0000000..bb68fb2 --- /dev/null +++ b/lib/src/sys_major/lvmap.rs @@ -0,0 +1,351 @@ +use crate::common::{ + SystemDecoder, + entities::DecodedLength, + errors::{CodecError, RepositoryError}, +}; +use futures::{Stream, StreamExt}; +use std::{collections::HashMap, num::ParseIntError}; +use thiserror::Error; + +// We store words by encoded number length, then encoded value +// Example: +// root: +// - 3: +// - 750: +// - word: klasa +// - word: gilza +// - 849: +// - word: farba +// - 2: +// - 45: +// - word: oral + +#[derive(Error, Debug)] +pub enum LenValueMapError { + #[error("value parsing error: {0}")] + Parse(#[from] ParseIntError), + + #[error(transparent)] + Codec(#[from] CodecError), + + #[error(transparent)] + Repository(#[from] RepositoryError), + + #[error("unable to build encoder data: {0}")] + Build(String), +} + +type DecodedNumber = 
u64; +pub type LenValueData = HashMap>>; + +#[derive(Debug, Default, Clone)] +pub struct LenValueMap { + data: LenValueData, +} + +impl LenValueMap { + pub fn new() -> Self { + Self::default() + } + + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + pub fn into_data(self) -> LenValueData { + self.data + } + + pub fn push(&mut self, len: u8, num: DecodedNumber, word: impl Into) -> &mut Self { + self.data + .entry(DecodedLength::from(len)) + .or_insert_with(HashMap::new) + .entry(num) + .or_insert_with(Vec::new) + .push(word.into()); + self + } + + pub fn get(&self, len: u8, num: DecodedNumber) -> Option<&Vec> { + self.data.get(&DecodedLength::from(len))?.get(&num) + } + + pub fn insert_words( + &mut self, + words: I, + decoder: &dyn SystemDecoder, + ) -> Result<(), LenValueMapError> + where + I: IntoIterator, + { + for word in words { + if word.is_empty() { + continue; + } + let decoded = decoder.decode(&word)?; + if decoded.is_empty() { + continue; + } + + self.data + .entry(decoded.value_len()?) + .or_default() + .entry(decoded.parse()?) + .or_default() + .push(word); + } + Ok(()) + } + + pub fn from_data(data: LenValueData) -> Self { + Self { data: data } + } + + pub async fn from_stream( + stream: S, + decoder: &dyn SystemDecoder, + ) -> Result + where + // S is a stream of "Result, Error>" + S: Stream, RepositoryError>>, + { + let mut map = LenValueMap::new(); + let mut stream = Box::pin(stream); + + // We stream the batches one by one. + // This ensures only one batch is in memory at a time. 
+ while let Some(batch_result) = stream.next().await { + match batch_result { + Ok(batch) => { + // We delegate to the synchronous logic for the heavy lifting + map.insert_words(batch, decoder)?; + } + Err(e) => { + // Convert RepositoryError to LenValueMapError::Build + return Err(e.into()); + } + } + } + + Ok(map) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::common::{entities::*, errors::*}; + use futures::stream; + + use std::collections::HashMap; + + use mockall::{mock, predicate::*}; + + const TEST_WORD_1: &str = "test_word_1"; + const TEST_WORD_2: &str = "test_word_2"; + const TEST_WORD_3: &str = "test_word_3"; + const TEST_WORD_4: &str = "test_word_4"; + const TEST_NUM_1: u64 = 12; + const TEST_NUM_2: u64 = 34; + const TEST_NUM_3: u64 = 9876; + const TEST_NUM_1_LEN: DecodedLength = DecodedLength::from(2); + const TEST_NUM_3_LEN: DecodedLength = DecodedLength::from(4); + + fn decoded_value(n: u64) -> DecodedValue { + DecodedValue::new(n.to_string()).unwrap() + } + + fn get_test_dec_map() -> HashMap { + HashMap::from([ + (TEST_WORD_1.to_string(), decoded_value(TEST_NUM_1)), + (TEST_WORD_2.to_string(), decoded_value(TEST_NUM_2)), + (TEST_WORD_3.to_string(), decoded_value(TEST_NUM_3)), + (TEST_WORD_4.to_string(), decoded_value(TEST_NUM_3)), + ]) + } + + fn mock_decoding(word: &str) -> Result { + get_test_dec_map() + .remove(word) + .ok_or_else(|| CodecError::UnexpectedError("".to_string())) + } + + fn get_test_words() -> Vec { + vec![ + TEST_WORD_1.to_string(), + TEST_WORD_2.to_string(), + TEST_WORD_3.to_string(), + TEST_WORD_4.to_string(), + ] + } + + mock! 
{ + pub Decoder {} + impl SystemDecoder for Decoder { + fn decode(&self, word: &str) -> Result; + } + } + + #[test] + fn test_single_word() { + let words = vec![TEST_WORD_1.to_string()]; + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + + let data = lv_map.into_data(); + + assert_eq!(data.len(), 1); + assert!(data.contains_key(&TEST_NUM_1_LEN)); + let data = data.get(&TEST_NUM_1_LEN).unwrap(); + assert!(data.contains_key(&TEST_NUM_1)); + let words = data.get(&TEST_NUM_1).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_1); + } + + #[test] + fn test_multiple_words() { + let words = get_test_words(); + + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + + let data = lv_map.into_data(); + + assert_eq!(data.len(), 2); // two different lengths + assert!(data.contains_key(&TEST_NUM_1_LEN)); + assert!(data.contains_key(&TEST_NUM_3_LEN)); + let l2 = data.get(&TEST_NUM_1_LEN).unwrap(); + let l4 = data.get(&TEST_NUM_3_LEN).unwrap(); + + assert_eq!(l2.len(), 2); // two numbers + assert_eq!(l4.len(), 1); // one number + assert!(l2.contains_key(&TEST_NUM_1)); + assert!(l2.contains_key(&TEST_NUM_2)); + assert!(l4.contains_key(&TEST_NUM_3)); + + let words = l2.get(&TEST_NUM_1).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_1); + + let words = l2.get(&TEST_NUM_2).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_2); + + let words = l4.get(&TEST_NUM_3).unwrap(); + assert_eq!(words.len(), 2); + assert!(words.contains(&TEST_WORD_3.to_string())); + assert!(words.contains(&TEST_WORD_4.to_string())); + } + + #[test] + fn test_skip_empty_decodes() { + let words = vec![TEST_WORD_1.to_string(), TEST_WORD_2.to_string()]; 
+ let mut decoder = MockDecoder::new(); + decoder.expect_decode().returning(|word| { + if word == TEST_WORD_1 { + DecodedValue::new("".to_string()) + } else { + DecodedValue::new(TEST_NUM_2.to_string()) + } + }); + + let mut lv_map = LenValueMap::new(); + lv_map.insert_words(words, &decoder).unwrap(); + + let data = lv_map.into_data(); + + assert_eq!(data.len(), 1); + assert!(data.contains_key(&TEST_NUM_1_LEN)); + let data = data.get(&TEST_NUM_1_LEN).unwrap(); + assert!(data.contains_key(&TEST_NUM_2)); + let words = data.get(&TEST_NUM_2).unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0], TEST_WORD_2); + } + + #[test] + fn test_decoder_error_propagates() { + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|_| Err(CodecError::UnexpectedError("boom".into()))); + + let mut map = LenValueMap::new(); + let result = map.insert_words(vec!["x".into()], &decoder); + + assert!(result.is_err()); + } + + // --- build --- + + #[tokio::test] + async fn test_from_stream_success() { + // 1. Setup Mocks (Same as before) + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + // 2. Prepare Data + // We wrap the inner Vecs in Ok() because the stream expects Result, RepositoryError> + let batches = vec![ + Ok(vec![TEST_WORD_1.into(), TEST_WORD_2.into()]), + Ok(vec![TEST_WORD_3.into(), TEST_WORD_4.into()]), + ]; + + // 3. Create a Stream from the Vec + // stream::iter converts an IntoIterator into a Stream + let stream = stream::iter(batches); + + // 4. Inject the stream (Dependency Injection) + let map = LenValueMap::from_stream(stream, &decoder) + .await + .expect("Should build map successfully"); + + // 5. 
Assertions + let data = map.into_data(); + assert_eq!(data.len(), 2); + assert!(data.contains_key(&TEST_NUM_1_LEN)); + assert!(data.contains_key(&TEST_NUM_3_LEN)); + } + + #[tokio::test] + async fn test_from_stream_failure() { + let mut decoder = MockDecoder::new(); + decoder + .expect_decode() + .returning(|word| mock_decoding(word)); + + let batches = vec![ + Ok(vec![TEST_WORD_1.into()]), + Err(RepositoryError::ConnectionFailed), + Ok(vec![TEST_WORD_3.into()]), + ]; + + let stream = stream::iter(batches); + let result = LenValueMap::from_stream(stream, &decoder).await; + + match result { + // We match specifically on the Repository variant and the ConnectionFailed inner error + Err(LenValueMapError::Repository(RepositoryError::ConnectionFailed)) => { + // Success! The correct error type propagated up. + } + // If it's any other error (including a stringified one), we fail + _ => panic!( + "Expected LenValueMapError::Repository(ConnectionFailed), got {:?}", + result + ), + } + } +} diff --git a/lib/src/application.rs b/lib/src/sys_major/removeme.old/application.rs similarity index 100% rename from lib/src/application.rs rename to lib/src/sys_major/removeme.old/application.rs diff --git a/lib/src/application/config.rs b/lib/src/sys_major/removeme.old/application/config.rs similarity index 100% rename from lib/src/application/config.rs rename to lib/src/sys_major/removeme.old/application/config.rs diff --git a/lib/src/application/errors.rs b/lib/src/sys_major/removeme.old/application/errors.rs similarity index 100% rename from lib/src/application/errors.rs rename to lib/src/sys_major/removeme.old/application/errors.rs diff --git a/lib/src/application/services.rs b/lib/src/sys_major/removeme.old/application/services.rs similarity index 100% rename from lib/src/application/services.rs rename to lib/src/sys_major/removeme.old/application/services.rs diff --git a/lib/src/application/traits.rs b/lib/src/sys_major/removeme.old/application/traits.rs similarity index 100% 
rename from lib/src/application/traits.rs rename to lib/src/sys_major/removeme.old/application/traits.rs diff --git a/lib/src/core.rs b/lib/src/sys_major/removeme.old/core.rs similarity index 100% rename from lib/src/core.rs rename to lib/src/sys_major/removeme.old/core.rs diff --git a/lib/src/sys_major/removeme.old/core/entities.rs b/lib/src/sys_major/removeme.old/core/entities.rs new file mode 100644 index 0000000..720d554 --- /dev/null +++ b/lib/src/sys_major/removeme.old/core/entities.rs @@ -0,0 +1,146 @@ +use super::errors::CodecError; +use serde::Serialize; +use std::num::ParseIntError; +use std::ops::Deref; +use std::{collections::HashMap, u64}; + +/// A number encoded as a sequence of words +#[derive(Debug, Clone, Serialize)] +pub struct EncodedPart { + pub value: u64, + pub words: Vec, +} + +/// A way (variant) to split input number +pub type EncodedSplit = Vec; + +/// A number encoded as words, split in multiple ways +#[derive(Debug, Clone, Serialize)] +pub struct EncodedValue(Vec); + +impl EncodedValue { + pub fn new(data: Vec) -> Self { + EncodedValue(data) + } +} + +impl Deref for EncodedValue { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// The number value can be encoded as many word sets, +/// but decoded as one number. For partial values +/// and dictionary words (reasonable length), we can use +/// u64 (20-digit number), but the whole input text can +/// be longer than 20 digits, so we operate on String (<= 255). 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DecodedValue(String); + +impl DecodedValue { + pub fn new(value: String) -> Result { + if value.len() > u8::MAX as usize { + Err(CodecError::TextTooLong(value.len())) + } else { + Ok(Self(value)) + } + } + + pub fn as_str(&self) -> &str { + &self.0 + } + + pub fn parse(&self) -> Result { + self.0.parse() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn value_len(&self) -> Result { + if self.len() == 0 { + return Err(CodecError::EmptyValue); + } + DecodedLength::try_from(self.len()) + } +} + +impl PartialEq<&str> for DecodedValue { + fn eq(&self, other: &&str) -> bool { + &self.0 == *other + } +} + +impl PartialEq for &str { + fn eq(&self, other: &DecodedValue) -> bool { + *self == &other.0 + } +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +pub struct DecodedLength(u8); + +impl DecodedLength { + pub const fn from(value: u8) -> Self { + Self(value) + } +} + +impl TryFrom for DecodedLength { + type Error = CodecError; + fn try_from(value: usize) -> Result { + if value > u8::MAX as usize { + Err(CodecError::ValueLimitExceeded(value)) + } else { + Ok(Self(value as u8)) + } + } +} + +// --- Dictionary --- + +pub type DictEntryId = u64; + +#[derive(Debug, Clone, PartialEq)] +pub struct DictEntry { + pub id: Option, + pub text: String, + pub metadata: HashMap, +} + +impl DictEntry { + pub fn new(id: Option, text: String) -> Self { + DictEntry { + id, + text, + metadata: HashMap::new(), + } + } +} + +#[derive(Debug, Clone)] +pub struct Dict { + pub name: String, + pub entries: HashMap, +} + +impl Dict { + pub fn new(name: String) -> Self { + Dict { + name, + entries: HashMap::new(), + } + } + + pub fn add_entry(&mut self, entry: DictEntry) { + self.entries.insert(entry.id.unwrap(), entry); + } +} diff --git a/lib/src/sys_major/removeme.old/core/errors.rs b/lib/src/sys_major/removeme.old/core/errors.rs new file mode 100644 index 
0000000..7f05a37 --- /dev/null +++ b/lib/src/sys_major/removeme.old/core/errors.rs @@ -0,0 +1,31 @@ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum RepositoryError { + #[error("Data source connection failed")] + ConnectionFailed, + + #[error("'{0}' not found")] + NotFound(String), + + #[error("Storage error: {0}")] + StorageError(String), +} + +#[derive(Debug, Error)] +pub enum CodecError { + #[error("text too long: {0} bytes")] + TextTooLong(usize), + + #[error("value too large: {0}/255")] + ValueLimitExceeded(usize), + + #[error("operation not allowed on empty value")] + EmptyValue, + + #[error("initialization failed")] + InitializationFailed, + + #[error("unexpected error: {0}")] + UnexpectedError(String), +} diff --git a/lib/src/core/sys_major.rs b/lib/src/sys_major/removeme.old/core/sys_major.rs similarity index 100% rename from lib/src/core/sys_major.rs rename to lib/src/sys_major/removeme.old/core/sys_major.rs diff --git a/lib/src/core/sys_major/decoder.rs b/lib/src/sys_major/removeme.old/core/sys_major/decoder.rs similarity index 100% rename from lib/src/core/sys_major/decoder.rs rename to lib/src/sys_major/removeme.old/core/sys_major/decoder.rs diff --git a/lib/src/core/sys_major/decoder_tests.rs b/lib/src/sys_major/removeme.old/core/sys_major/decoder_tests.rs similarity index 100% rename from lib/src/core/sys_major/decoder_tests.rs rename to lib/src/sys_major/removeme.old/core/sys_major/decoder_tests.rs diff --git a/lib/src/core/sys_major/encoder.rs b/lib/src/sys_major/removeme.old/core/sys_major/encoder.rs similarity index 100% rename from lib/src/core/sys_major/encoder.rs rename to lib/src/sys_major/removeme.old/core/sys_major/encoder.rs diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/sys_major/removeme.old/core/sys_major/lvmap.rs similarity index 100% rename from lib/src/core/sys_major/lvmap.rs rename to lib/src/sys_major/removeme.old/core/sys_major/lvmap.rs diff --git a/lib/src/core/sys_major/rules_en.rs 
b/lib/src/sys_major/removeme.old/core/sys_major/rules_en.rs similarity index 100% rename from lib/src/core/sys_major/rules_en.rs rename to lib/src/sys_major/removeme.old/core/sys_major/rules_en.rs diff --git a/lib/src/core/sys_major/rules_pl.rs b/lib/src/sys_major/removeme.old/core/sys_major/rules_pl.rs similarity index 100% rename from lib/src/core/sys_major/rules_pl.rs rename to lib/src/sys_major/removeme.old/core/sys_major/rules_pl.rs diff --git a/lib/src/core/system.rs b/lib/src/sys_major/removeme.old/core/system.rs similarity index 100% rename from lib/src/core/system.rs rename to lib/src/sys_major/removeme.old/core/system.rs diff --git a/lib/src/core/traits.rs b/lib/src/sys_major/removeme.old/core/traits.rs similarity index 100% rename from lib/src/core/traits.rs rename to lib/src/sys_major/removeme.old/core/traits.rs diff --git a/lib/src/infrastructure.rs b/lib/src/sys_major/removeme.old/infrastructure.rs similarity index 100% rename from lib/src/infrastructure.rs rename to lib/src/sys_major/removeme.old/infrastructure.rs diff --git a/lib/src/infrastructure/errors.rs b/lib/src/sys_major/removeme.old/infrastructure/errors.rs similarity index 100% rename from lib/src/infrastructure/errors.rs rename to lib/src/sys_major/removeme.old/infrastructure/errors.rs diff --git a/lib/src/infrastructure/json_file_dict_source.rs b/lib/src/sys_major/removeme.old/infrastructure/json_file_dict_source.rs similarity index 100% rename from lib/src/infrastructure/json_file_dict_source.rs rename to lib/src/sys_major/removeme.old/infrastructure/json_file_dict_source.rs diff --git a/lib/src/infrastructure/sqlite_dict_repository.rs b/lib/src/sys_major/removeme.old/infrastructure/sqlite_dict_repository.rs similarity index 100% rename from lib/src/infrastructure/sqlite_dict_repository.rs rename to lib/src/sys_major/removeme.old/infrastructure/sqlite_dict_repository.rs diff --git a/lib/src/presentation.rs b/lib/src/sys_major/removeme.old/presentation.rs similarity index 100% rename from 
lib/src/presentation.rs rename to lib/src/sys_major/removeme.old/presentation.rs diff --git a/lib/src/sys_major/rules_en.rs b/lib/src/sys_major/rules_en.rs new file mode 100644 index 0000000..78f6cc6 --- /dev/null +++ b/lib/src/sys_major/rules_en.rs @@ -0,0 +1,15 @@ +use super::decoder::{Rule, Rules}; + +pub fn get_rules() -> Rules { + vec![ + Rule { + phoneme_in: "EN".to_string(), + phoneme_out: "2".to_string(), + not_after: vec!["Y".to_string()], + not_before: vec!["X".to_string()], + only_after: vec!["A".to_string()], + only_before: vec!["C".to_string()], + }, + // ...more entries... + ] +} diff --git a/lib/src/sys_major/rules_pl.rs b/lib/src/sys_major/rules_pl.rs new file mode 100644 index 0000000..0cdf587 --- /dev/null +++ b/lib/src/sys_major/rules_pl.rs @@ -0,0 +1,222 @@ +use super::decoder::{Rule, Rules}; + +pub fn get_rules() -> Rules { + vec![ + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "S".to_string(), + phoneme_out: "0".to_string(), + not_before: vec!["I".to_string(), "Z".to_string()], + only_before: vec![], + }, + Rule { + not_after: vec![ + "C".to_string(), + "D".to_string(), + "R".to_string(), + "S".to_string(), + ], + only_after: vec![], + phoneme_in: "Z".to_string(), + phoneme_out: "0".to_string(), + not_before: vec!["I".to_string()], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "T".to_string(), + phoneme_out: "1".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + only_after: vec![], + not_after: vec![], + phoneme_in: "D".to_string(), + phoneme_out: "1".to_string(), + not_before: vec!["Z".to_string(), "Ź".to_string(), "Ż".to_string()], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "N".to_string(), + phoneme_out: "2".to_string(), + not_before: vec!["I".to_string()], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "M".to_string(), + phoneme_out: "3".to_string(), + 
not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "R".to_string(), + phoneme_out: "4".to_string(), + not_before: vec!["Z".to_string()], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "L".to_string(), + phoneme_out: "5".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "J".to_string(), + phoneme_out: "6".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "K".to_string(), + phoneme_out: "7".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "G".to_string(), + phoneme_out: "7".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "F".to_string(), + phoneme_out: "8".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "W".to_string(), + phoneme_out: "8".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "P".to_string(), + phoneme_out: "9".to_string(), + not_before: vec![], + only_before: vec![], + }, + Rule { + not_after: vec![], + only_after: vec![], + phoneme_in: "B".to_string(), + phoneme_out: "9".to_string(), + not_before: vec![], + only_before: vec![], + }, + ] +} + +#[cfg(test)] +mod tests { + use super::*; + use super::super::decoder::Decoder; + use crate::common::traits::SystemDecoder; + + #[test] + fn test_major_dict_pl_decode_0_1() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("SZSCZ").unwrap(); + assert_eq!(output, "0") + } + + #[test] + fn test_major_dict_pl_decode_0_2() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("SZSICZ").unwrap(); + 
assert_eq!(output, "") + } + + #[test] + fn test_major_dict_pl_decode_0_3() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("SZCZRZZCZDZSZ").unwrap(); + assert_eq!(output, "0") + } + + #[test] + fn test_major_dict_pl_decode_0_4() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("SZCZRZZICZDZSZ").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_major_dict_pl_decode_1_1() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("SZTCZ").unwrap(); + assert_eq!(output, "1") + } + + #[test] + fn test_major_dict_pl_decode_1_2() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("DZDŻDŹDDZDŻDŹ").unwrap(); + assert_eq!(output, "1") + } + + #[test] + fn test_major_dict_pl_decode_1_3() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("DZDŻDŹDZDZDŻDŹ").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_major_dict_pl_decode_2_1() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("NINNI").unwrap(); + assert_eq!(output, "2") + } + + #[test] + fn test_major_dict_pl_decode_2_2() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("NININI").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_major_dict_pl_decode_4_1() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("RZRRZ").unwrap(); + assert_eq!(output, "4") + } + + #[test] + fn test_major_dict_pl_decode_4_2() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("RZRZRZ").unwrap(); + assert_eq!(output, "") + } + + #[test] + fn test_major_dict_pl_decode_full_1() { + let decoder = Decoder::new(get_rules()); + let output = decoder.decode("ATADANAMARALAJAKAGAFAWAPABA").unwrap(); + assert_eq!(output, "1123456778899") + } +}