Browse Source

WIP: lvmap + encoder

develop-refactor
chodak166 5 months ago
parent
commit
c67bc5b2e9
  1. 2
      lib/src/application/services.rs
  2. 23
      lib/src/core/entities.rs
  3. 8
      lib/src/core/errors.rs
  4. 4
      lib/src/core/sys_major.rs
  5. 23
      lib/src/core/sys_major/encoder.rs
  6. 197
      lib/src/core/sys_major/lvmap.rs
  7. 21
      lib/src/core/traits.rs
  8. 2
      lib/src/infrastructure/sqlite_dict_repository.rs

2
lib/src/application/services.rs

@ -19,7 +19,7 @@ impl<'a, R: DictRepository> DictImporter<'a, R> {
mut source: impl DictSource, mut source: impl DictSource,
) -> Result<(), anyhow::Error> { ) -> Result<(), anyhow::Error> {
// 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?) // 1. Ensure Dict exists (Logic: Create if new, or maybe clear existing?)
self.repo.create(name).await?; self.repo.create_dict(name).await?;
let mut batch = Vec::with_capacity(self.batch_size); let mut batch = Vec::with_capacity(self.batch_size);

23
lib/src/core/entities.rs

@ -19,6 +19,7 @@ impl DictEntry {
} }
} }
#[derive(Debug, Clone)]
pub struct Dict { pub struct Dict {
pub name: String, pub name: String,
pub entries: HashMap<DictEntryId, DictEntry>, pub entries: HashMap<DictEntryId, DictEntry>,
@ -37,11 +38,19 @@ impl Dict {
} }
} }
// pub struct DecodedItem { #[derive(Debug, Clone)]
// pub value: String, pub struct EncodedItem {
// } pub entry: DictEntry,
}
// pub struct DecodedResult { impl EncodedItem {
// pub input: String, pub fn value(&self) -> &str {
// pub output: Vec<DecodedItem>, &self.entry.text
// } }
}
/// Result returned by `SystemEncoder::encode`: an input string paired with
/// the encoded items produced for it.
#[derive(Debug, Clone)]
pub struct EncodingResult {
/// NOTE(review): presumably the text that was passed to `encode` — confirm
/// once the encoder is implemented.
pub input: String,
/// Encoded items produced for `input`; may be empty.
pub output: Vec<EncodedItem>,
}

8
lib/src/core/errors.rs

@ -9,3 +9,11 @@ pub enum RepositoryError {
#[error("Storage error: {0}")] #[error("Storage error: {0}")]
StorageError(String), StorageError(String),
} }
/// Errors returned by `SystemEncoder` implementations.
// NOTE(review): `Error` is presumably `thiserror::Error`, matching the
// `#[error(...)]` attributes used on `RepositoryError` in this file — confirm.
#[derive(Error, Debug)]
pub enum EncoderError {
/// The encoder could not be initialized.
#[error("Encoder initialization failed")]
InitializationFailed,
/// Any other failure; the wrapped string is included in the displayed message.
#[error("Unexpected error: {0}")]
UnexpectedError(String),
}

4
lib/src/core/sys_major.rs

@ -1,4 +1,6 @@
mod decoder; mod decoder;
mod encoder;
mod lvmap;
pub mod rules_en; pub mod rules_en;
pub mod rules_pl; pub mod rules_pl;
@ -6,3 +8,5 @@ pub mod rules_pl;
mod decoder_tests; mod decoder_tests;
pub use decoder::*; pub use decoder::*;
pub use encoder::*;
pub use lvmap::LenValueMap;

23
lib/src/core/sys_major/encoder.rs

@ -0,0 +1,23 @@
use crate::core::{
entities::EncodingResult, errors::EncoderError, sys_major::LenValueMap, traits::*,
};
/// Encoder backed by a [`LenValueMap`].
///
/// Work in progress: `encode` is not implemented yet.
#[derive(Debug)]
pub struct Encoder {
    /// Lookup structure handed in at construction time.
    lv_map: LenValueMap,
}

impl Encoder {
    /// Wraps an already-built [`LenValueMap`] in an encoder.
    pub fn new(lv_map: LenValueMap) -> Self {
        Self { lv_map }
    }
}

impl SystemEncoder for Encoder {
    /// Currently a no-op that always succeeds.
    fn initialize(&self) -> Result<(), EncoderError> {
        Ok(())
    }

    /// Not implemented yet.
    ///
    /// # Panics
    ///
    /// Always panics via `todo!()` until the encoding logic is written.
    fn encode(&self, word: &str) -> Result<EncodingResult, EncoderError> {
        todo!()
    }
}

197
lib/src/core/sys_major/lvmap.rs

@ -0,0 +1,197 @@
use std::collections::HashMap;
use crate::core::{DictRepository, SystemEncoder};
// We store words by encoded number length, then encoded value
// Example:
// root:
// - 3:
// - 750:
// - word: klasa
// - word: gilza
// - 849:
// - word: farba
// - 2:
// - 45:
// - word: oral
/// Batch size assigned by `LenValueMap::new`.
// NOTE(review): presumably the number of dictionary entries fetched per
// repository call while building the map — nothing reads it yet; confirm.
const DEFAULT_DICT_BATCH_SIZE: usize = 100;
/// Length of an encoded number; the outer key of [`LenValueData`].
type ValueLength = u8;
/// Encoded numeric value; the inner key of [`LenValueData`].
type Value = u64;
/// Words grouped first by encoded-number length, then by encoded value.
pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>;
/// Holds words indexed by encoded-number length and encoded value,
/// together with a configurable batch size.
#[derive(Debug)]
pub struct LenValueMap {
/// The length -> value -> words lookup data.
map: LenValueData,
/// Batch size; set to `DEFAULT_DICT_BATCH_SIZE` by `new` and overridable
/// through `with_batch_size`.
batch_size: usize,
}
impl LenValueMap {
    /// Creates an empty map that uses `DEFAULT_DICT_BATCH_SIZE`.
    pub fn new() -> Self {
        LenValueMap {
            map: LenValueData::new(),
            batch_size: DEFAULT_DICT_BATCH_SIZE,
        }
    }

    /// Replaces the stored lookup data with `data` and returns the map.
    pub fn with_data(mut self, data: LenValueData) -> Self {
        self.map = data;
        self
    }

    /// Overrides the batch size and returns the map.
    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
        self.batch_size = batch_size;
        self
    }

    /// Builds a map for the given encoder and repository.
    ///
    /// Work in progress: neither `encoder` nor `repo` is consulted yet, so
    /// the returned map is always empty and uses the default batch size.
    pub fn build<E: SystemEncoder, R: DictRepository>(encoder: &E, repo: &R) -> Self {
        // TODO: populate `map` from the repository entries via the encoder.
        // The binding is immutable until that code exists, which avoids an
        // `unused_mut` warning.
        let map = LenValueData::new();
        LenValueMap::new().with_data(map)
    }

    /// Returns `true` when no words are stored.
    pub fn is_empty(&self) -> bool {
        self.map.is_empty()
    }
}

/// `Default` mirrors [`LenValueMap::new`]. A derived impl would set
/// `batch_size` to 0 instead of `DEFAULT_DICT_BATCH_SIZE`.
impl Default for LenValueMap {
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::{entities::*, errors::*};
use std::collections::HashMap;
// An encoder that yields an empty result must leave the built map empty,
// even when the repository holds a word.
// NOTE(review): "inpout" in the test name looks like a typo for "input".
#[test]
fn test_no_inpout_then_empty_map() {
    let encoder = MockEncoder::new(EncodingResult {
        input: String::new(),
        output: Vec::new(),
    });
    let repo = MockRepository::new().with_single_word_dict();

    let built = LenValueMap::build(&encoder, &repo);

    assert!(built.is_empty());
}
// #[test]
// fn test_empty_dict_then_no_output() {
// let dict = Dict {
// name: "test".to_string(),
// entries: HashMap::new(),
// };
// let repo = MockRepository::new(dict);
// let encoder = Encoder::new(&repo);
// let result = encoder.encode("test").unwrap();
// assert!(result.output.is_empty());
// }
// #[test]
// fn test_encode_single_digit() {
// let dict = get_single_word_dict();
// let repo = MockRepository::new(dict);
// let encoder = Encoder::new(&repo).with_batch_size(1);
// let result = encoder.encode("test").unwrap();
// assert!(result.output.is_empty());
// }
// ---------- Helpers ----------
/// Builds a dictionary named "test_dict" holding exactly one entry,
/// "test_word_1", stored under key 1.
fn get_single_word_dict() -> Dict {
    let only_entry = DictEntry {
        id: Some(1),
        text: String::from("test_word_1"),
        metadata: HashMap::new(),
    };

    let mut entries = HashMap::new();
    entries.insert(1, only_entry);

    Dict {
        name: String::from("test_dict"),
        entries,
    }
}
// ---------- Mocks ----------
/// Test double that returns the same canned `EncodingResult` for every word.
struct MockEncoder {
    /// The result handed back by every `encode` call.
    result: EncodingResult,
}

impl MockEncoder {
    /// Creates a mock that will always answer with `result`.
    fn new(result: EncodingResult) -> Self {
        Self { result }
    }
}

impl SystemEncoder for MockEncoder {
    /// Nothing to set up; always succeeds.
    fn initialize(&self) -> Result<(), EncoderError> {
        Ok(())
    }

    /// Ignores the word and returns a copy of the canned result.
    fn encode(&self, _word: &str) -> Result<EncodingResult, EncoderError> {
        let canned = self.result.clone();
        Ok(canned)
    }
}
/// In-memory repository test double backed by a single `Dict`.
struct MockRepository {
    /// The dictionary served by `fetch_many`.
    dict: Dict,
}

impl MockRepository {
    /// Creates a repository whose dictionary comes from `Dict::new("test_dict")`.
    pub fn new() -> Self {
        let dict = Dict::new("test_dict".to_string());
        Self { dict }
    }

    /// Swaps the backing dictionary for the one-word fixture.
    pub fn with_single_word_dict(mut self) -> Self {
        self.dict = get_single_word_dict();
        self
    }
}
#[async_trait::async_trait]
impl DictRepository for MockRepository {
    /// No-op; the mock always reports success.
    async fn create_dict(&self, _name: &str) -> Result<(), RepositoryError> {
        Ok(())
    }

    /// Not needed by the current tests; panics via `todo!()` if called.
    async fn save_entries(
        &self,
        _dict_name: &str,
        _entries: &[DictEntry],
    ) -> Result<(), RepositoryError> {
        todo!()
    }

    /// Returns a page of the backing dictionary, ordered by entry id.
    ///
    /// `offset` defaults to 0 and `limit` to `u32::MAX`; the dictionary
    /// name argument is ignored.
    async fn fetch_many(
        &self,
        _name: &str,
        limit: Option<u32>,
        offset: Option<u32>,
    ) -> Result<Dict, RepositoryError> {
        let skip_count = offset.unwrap_or(0) as usize;
        let take_count = limit.unwrap_or(u32::MAX) as usize;

        // HashMap iteration order is unspecified, so sort by id to make
        // pagination deterministic.
        let mut ordered: Vec<_> = self.dict.entries.iter().collect();
        ordered.sort_by_key(|(id, _)| **id);

        let mut page: HashMap<u64, DictEntry> = HashMap::new();
        for (id, entry) in ordered.into_iter().skip(skip_count).take(take_count) {
            let copy = DictEntry {
                id: entry.id,
                text: entry.text.clone(),
                metadata: entry.metadata.clone(),
            };
            page.insert(*id, copy);
        }

        Ok(Dict {
            name: self.dict.name.clone(),
            entries: page,
        })
    }
}
}

21
lib/src/core/traits.rs

@ -1,18 +1,27 @@
use crate::core::entities::EncodingResult;
use crate::core::errors::EncoderError;
use super::entities::{Dict, DictEntry}; use super::entities::{Dict, DictEntry};
use super::errors::RepositoryError; use super::errors::RepositoryError;
/// A number can be encoded as many different word sets, but a word set
/// always decodes to exactly one number. Partial values fit in a `u64`,
/// but a whole decoded value may be arbitrarily long, so it is
/// represented as a string.
type DecodedValue = String;
pub trait SystemDecoder { pub trait SystemDecoder {
fn decode(&self, word: &str) -> String; fn decode(&self, word: &str) -> DecodedValue;
} }
// pub trait SystenDecoder { pub trait SystemEncoder {
// fn initialize(&self) -> Result<(), anyhow::Error>; fn initialize(&self) -> Result<(), EncoderError>;
fn encode(&self, word: &str) -> Result<EncodingResult, EncoderError>;
// } }
#[async_trait::async_trait] #[async_trait::async_trait]
pub trait DictRepository: Send + Sync { pub trait DictRepository: Send + Sync {
async fn create(&self, name: &str) -> Result<(), RepositoryError>; async fn create_dict(&self, name: &str) -> Result<(), RepositoryError>;
/// "Upsert" logic: /// "Upsert" logic:
/// - If entry exists (by text), update metadata. /// - If entry exists (by text), update metadata.

2
lib/src/infrastructure/sqlite_dict_repository.rs

@ -87,7 +87,7 @@ impl SqliteDictRepository {
#[async_trait::async_trait] #[async_trait::async_trait]
impl DictRepository for SqliteDictRepository { impl DictRepository for SqliteDictRepository {
async fn create(&self, name: &str) -> Result<(), RepositoryError> { async fn create_dict(&self, name: &str) -> Result<(), RepositoryError> {
sqlx::query("INSERT OR IGNORE INTO dictionaries (name) VALUES (?)") sqlx::query("INSERT OR IGNORE INTO dictionaries (name) VALUES (?)")
.bind(name) .bind(name)
.execute(&self.pool) .execute(&self.pool)

Loading…
Cancel
Save