From ff94981ddeb35315e8f89df5bebe1ecf002b0d90 Mon Sep 17 00:00:00 2001
From: chodak166
Date: Sun, 16 Nov 2025 19:49:06 +0100
Subject: [PATCH] WIP project structure and encoder first impl

---
 lib/Cargo.toml                                |   1 +
 lib/src/core/major.rs                         |   5 +
 lib/src/core/major/dict_en.rs                 |  15 ++
 lib/src/core/major/dict_pl.rs                 |  15 ++
 lib/src/core/major/encoder.rs                 | 156 ++++++++++++++++++
 lib/src/core/major/tests.rs                   |  11 +
 lib/src/core/major_system_encoder.rs          |  23 ---
 lib/src/core/mod.rs                           |   8 +-
 lib/src/core/system.rs                        |   6 +-
 lib/src/core/{system_encoder.rs => traits.rs} |   0
 10 files changed, 210 insertions(+), 30 deletions(-)
 create mode 100644 lib/src/core/major.rs
 create mode 100644 lib/src/core/major/dict_en.rs
 create mode 100644 lib/src/core/major/dict_pl.rs
 create mode 100644 lib/src/core/major/encoder.rs
 create mode 100644 lib/src/core/major/tests.rs
 delete mode 100644 lib/src/core/major_system_encoder.rs
 rename lib/src/core/{system_encoder.rs => traits.rs} (100%)

diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 79650cf..3dc33c3 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -4,3 +4,4 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
+once_cell = "1.21.3"
diff --git a/lib/src/core/major.rs b/lib/src/core/major.rs
new file mode 100644
index 0000000..9b3e0a6
--- /dev/null
+++ b/lib/src/core/major.rs
@@ -0,0 +1,5 @@
+pub mod dict_en;
+pub mod dict_pl;
+mod encoder;
+
+pub use encoder::*;
diff --git a/lib/src/core/major/dict_en.rs b/lib/src/core/major/dict_en.rs
new file mode 100644
index 0000000..3068233
--- /dev/null
+++ b/lib/src/core/major/dict_en.rs
@@ -0,0 +1,15 @@
+use crate::core::major::{Dict, DictEntry};
+
+pub fn get_dict() -> Dict {
+    vec![
+        DictEntry {
+            phoneme_in: "EN".to_string(),
+            phoneme_out: "2".to_string(),
+            not_after: vec!["Y".to_string()],
+            not_before: vec!["X".to_string()],
+            only_after: vec!["A".to_string()],
+            only_before: vec!["C".to_string()],
+        },
+        // ...more entries...
+    ]
+}
diff --git a/lib/src/core/major/dict_pl.rs b/lib/src/core/major/dict_pl.rs
new file mode 100644
index 0000000..2cdfab3
--- /dev/null
+++ b/lib/src/core/major/dict_pl.rs
@@ -0,0 +1,15 @@
+use crate::core::major::{Dict, DictEntry};
+
+pub fn get_dict() -> Dict {
+    vec![
+        DictEntry {
+            phoneme_in: "PL".to_string(),
+            phoneme_out: "2".to_string(),
+            not_after: vec!["Y".to_string()],
+            not_before: vec!["X".to_string()],
+            only_after: vec!["A".to_string()],
+            only_before: vec!["C".to_string()],
+        },
+        // ...more entries...
+    ]
+}
diff --git a/lib/src/core/major/encoder.rs b/lib/src/core/major/encoder.rs
new file mode 100644
index 0000000..ed279ae
--- /dev/null
+++ b/lib/src/core/major/encoder.rs
@@ -0,0 +1,156 @@
+use crate::core::traits::SystemEncoder;
+
+/// A single substitution rule of a [`Dict`].
+///
+/// Every occurrence of `phoneme_in` in a word is encoded as `phoneme_out`,
+/// provided the text around the occurrence satisfies the context lists below.
+/// An empty list imposes no constraint.
+#[derive(Debug, Default, Clone)]
+pub struct DictEntry {
+    /// Substring searched for in the input word.
+    pub phoneme_in: String,
+    /// Value emitted for each accepted occurrence of `phoneme_in`.
+    pub phoneme_out: String,
+
+    /// Reject the occurrence when the text right after it starts with any of these.
+    pub not_before: Vec<String>,
+    /// Reject the occurrence when the text right before it ends with any of these.
+    pub not_after: Vec<String>,
+
+    /// When non-empty, the text right after the occurrence must start with one of these.
+    pub only_before: Vec<String>,
+    /// When non-empty, the text right before the occurrence must end with one of these.
+    pub only_after: Vec<String>,
+}
+
+/// Ordered collection of substitution rules.
+pub type Dict = Vec<DictEntry>;
+
+/// (byte index of the match within the word, encoded value)
+type DictMatches = Vec<(usize, String)>;
+
+/// Returns `true` when `text` ends with at least one of `suffixes`.
+fn ends_with_any(text: &str, suffixes: &[String]) -> bool {
+    suffixes.iter().any(|suffix| text.ends_with(suffix.as_str()))
+}
+
+/// Returns `true` when `text` starts with at least one of `prefixes`.
+fn starts_with_any(text: &str, prefixes: &[String]) -> bool {
+    prefixes.iter().any(|prefix| text.starts_with(prefix.as_str()))
+}
+
+/// Encodes words by applying every rule of a [`Dict`].
+#[derive(Debug, Clone)]
+pub struct Encoder {
+    dict: Dict,
+}
+
+impl Encoder {
+    /// Creates an encoder backed by `dict`.
+    pub fn new(dict: Dict) -> Self {
+        Encoder { dict }
+    }
+
+    /// Collects every occurrence of `entry.phoneme_in` in `word` whose context
+    /// satisfies the entry's constraints.
+    fn match_entry(&self, entry: &DictEntry, word: &str) -> DictMatches {
+        word.match_indices(entry.phoneme_in.as_str())
+            .filter(|&(index, _)| self.is_context_matched(entry, word, index))
+            .map(|(index, _)| (index, entry.phoneme_out.clone()))
+            .collect()
+    }
+
+    /// Checks the context constraints of `entry` for the occurrence of
+    /// `entry.phoneme_in` starting at byte offset `index` of `word`.
+    fn is_context_matched(&self, entry: &DictEntry, word: &str, index: usize) -> bool {
+        // `index` comes from `match_indices`, so both cuts fall on char boundaries.
+        let before_context = &word[..index];
+        let after_context = &word[index + entry.phoneme_in.len()..];
+
+        if ends_with_any(before_context, &entry.not_after)
+            || starts_with_any(after_context, &entry.not_before)
+        {
+            return false;
+        }
+
+        // An empty `only_*` list means "no restriction"; without the emptiness
+        // check such entries could never match.
+        let after_ok =
+            entry.only_after.is_empty() || ends_with_any(before_context, &entry.only_after);
+        let before_ok =
+            entry.only_before.is_empty() || starts_with_any(after_context, &entry.only_before);
+
+        after_ok && before_ok
+    }
+}
+
+impl SystemEncoder for Encoder {
+    /// Encodes `word` as the concatenation of the values emitted by all matching
+    /// dictionary rules, ordered by the position of each match in `word`.
+    fn encode(&self, word: &str) -> String {
+        let mut matches: DictMatches = self
+            .dict
+            .iter()
+            .flat_map(|entry| self.match_entry(entry, word))
+            .collect();
+
+        // Stable sort: matches at the same position keep dictionary order.
+        matches.sort_by_key(|&(pos, _)| pos);
+        matches.into_iter().map(|(_, value)| value).collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn create_basic_dict() -> Dict {
+        vec![DictEntry {
+            phoneme_in: "B".to_string(),
+            phoneme_out: "2".to_string(),
+            not_after: vec!["Y".to_string()],
+            not_before: vec!["X".to_string()],
+            only_after: vec!["A".to_string()],
+            only_before: vec!["C".to_string()],
+        }]
+    }
+
+    #[test]
+    fn test_single_symbol_encoding_all_requirements_met() {
+        let encoder = Encoder::new(create_basic_dict());
+        let output = encoder.encode("ABC");
+        assert_eq!(output, "2")
+    }
+
+    #[test]
+    fn test_required_context_missing() {
+        let encoder = Encoder::new(create_basic_dict());
+        assert_eq!(encoder.encode("ABD"), "");
+        assert_eq!(encoder.encode("DBC"), "");
+    }
+
+    #[test]
+    fn test_forbidden_context_present() {
+        let encoder = Encoder::new(vec![DictEntry {
+            phoneme_in: "B".to_string(),
+            phoneme_out: "2".to_string(),
+            not_after: vec!["Y".to_string()],
+            not_before: vec!["X".to_string()],
+            ..Default::default()
+        }]);
+        assert_eq!(encoder.encode("YB"), "");
+        assert_eq!(encoder.encode("BX"), "");
+        assert_eq!(encoder.encode("AB"), "2");
+    }
+
+    #[test]
+    fn test_empty_context_lists_impose_no_constraint() {
+        let encoder = Encoder::new(vec![DictEntry {
+            phoneme_in: "B".to_string(),
+            phoneme_out: "2".to_string(),
+            ..Default::default()
+        }]);
+        assert_eq!(encoder.encode("B"), "2");
+        assert_eq!(encoder.encode("ABBA"), "22");
+    }
+}
diff --git a/lib/src/core/major/tests.rs b/lib/src/core/major/tests.rs
new file mode 100644
index 0000000..12c3af6
--- /dev/null
+++ b/lib/src/core/major/tests.rs
@@ -0,0 +1,11 @@
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // #[test]
+    // fn test_processing() {
+    //     let processor = TextProcessor::new(">> ");
+    //     assert_eq!(processor.process("hello"), ">> HELLO");
+    // }
+}
diff --git a/lib/src/core/major_system_encoder.rs b/lib/src/core/major_system_encoder.rs
deleted file mode 100644
index 62f66b2..0000000
--- a/lib/src/core/major_system_encoder.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-use crate::core::SystemEncoder;
-
-pub struct MajorEncoder {
-    dict: String, // TODO
-}
-
-impl MajorEncoder {
-    pub fn new(dict: &str) -> Self {
-        MajorEncoder {
-            dict: String::from(dict),
-        }
-    }
-}
-
-impl SystemEncoder for MajorEncoder {
-    fn encode(&self, word: &str) -> String {
-        let num_word: String = word
-            .chars()
-            .map(|c| c.to_digit(10).unwrap_or(0).to_string())
-            .collect();
-        format!("{}_{} -> {}", word, self.dict, num_word)
-    }
-}
diff --git a/lib/src/core/mod.rs b/lib/src/core/mod.rs
index 177eae0..2cf7ef2 100644
--- a/lib/src/core/mod.rs
+++ b/lib/src/core/mod.rs
@@ -1,7 +1,7 @@
-pub mod major_system_encoder;
+pub mod major;
 pub mod system;
-pub mod system_encoder;
+pub mod traits;
 
-pub use self::major_system_encoder::*;
+pub use self::major::*;
 pub use self::system::*;
-pub use self::system_encoder::*;
+pub use self::traits::*;
diff --git a/lib/src/core/system.rs b/lib/src/core/system.rs
index fe6579a..2f0dd13 100644
--- a/lib/src/core/system.rs
+++ b/lib/src/core/system.rs
@@ -1,5 +1,5 @@
-use crate::core::MajorEncoder;
 use crate::core::SystemEncoder;
+use crate::core::major;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum System {
@@ -9,7 +9,7 @@ pub enum System {
 
 pub fn create_encoder(system: &System) -> Box<dyn SystemEncoder> {
     match system {
-        System::MajorPl => Box::new(MajorEncoder::new("dict-major-pl")),
-        System::MajorEn => Box::new(MajorEncoder::new("dict-major-en")),
+        System::MajorPl => Box::new(major::Encoder::new(major::dict_pl::get_dict())),
+        System::MajorEn => Box::new(major::Encoder::new(major::dict_en::get_dict())),
     }
 }
diff --git a/lib/src/core/system_encoder.rs b/lib/src/core/traits.rs
similarity index 100%
rename from lib/src/core/system_encoder.rs
rename to lib/src/core/traits.rs