Browse Source

WIP: lv_map tests

develop-refactor
chodak166 5 months ago
parent
commit
05be6054a2
  1. 12
      example_dict.json
  2. 7
      lib/Cargo.toml
  3. 8
      lib/src/core/errors.rs
  4. 279
      lib/src/core/sys_major/lvmap.rs
  5. 2
      lib/src/core/traits.rs

12
example_dict.json

@ -1,12 +0,0 @@
[
{"word": "hello", "metadata": {"type": "greeting", "language": "english"}},
{"word": "world", "metadata": {"type": "noun", "language": "english"}},
{"word": "rust", "metadata": {"type": "programming_language", "paradigm": "systems"}},
{"word": "programming", "metadata": {"type": "verb", "context": "computing"}},
{"word": "database", "metadata": {"type": "noun", "context": "data_storage"}},
{"word": "sqlite", "metadata": {"type": "database_engine", "features": ["embedded", "sql"]}},
{"word": "json", "metadata": {"type": "data_format", "standard": "RFC 8259"}},
{"word": "import", "metadata": {"type": "verb", "context": "data_operations"}},
{"word": "dictionary", "metadata": {"type": "noun", "context": "reference"}},
{"word": "example", "metadata": {"type": "noun", "usage": "demonstration"}}
]

7
lib/Cargo.toml

@ -14,7 +14,10 @@ anyhow = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
thiserror = "1.0"
thiserror = "2.0"
async-trait = "0.1"
parking_lot = "0.12"
sqlx = { version = "0.7", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] }
sqlx = { version = "0.8.6", features = ["runtime-tokio", "sqlite", "chrono", "migrate"] }
[dev-dependencies]
mockall = "0.14.0"

8
lib/src/core/errors.rs

@ -17,3 +17,11 @@ pub enum EncoderError {
#[error("Unexpected error: {0}")]
UnexpectedError(String),
}
/// Error type for the decoding side; it shares the `UnexpectedError(String)`
/// variant shape with `EncoderError` declared just above.
///
/// NOTE(review): nothing visible in this change constructs or returns this
/// type yet — confirm the intended users.
#[derive(Error, Debug)]
pub enum DecoderError {
/// Payload-free variant; displays as "Decoder input error".
#[error("Decoder input error")]
InputError,
/// Catch-all variant; the wrapped string is interpolated into the message.
#[error("Unexpected error: {0}")]
UnexpectedError(String),
}

279
lib/src/core/sys_major/lvmap.rs

@ -1,6 +1,10 @@
use std::collections::HashMap;
use crate::core::{DictRepository, SystemEncoder};
use anyhow::Error;
use crate::core::errors::RepositoryError;
use crate::core::traits::DecodedValue;
use crate::core::{DictRepository, SystemDecoder};
// We store words by encoded number length, then encoded value
// Example:
@ -15,6 +19,7 @@ use crate::core::{DictRepository, SystemEncoder};
// - 45:
// - word: oral
// Words are fetched from DictRepository in batches
const DEFAULT_DICT_BATCH_SIZE: usize = 100;
type ValueLength = u8;
@ -23,35 +28,66 @@ pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>;
#[derive(Debug)]
pub struct LenValueMap {
map: LenValueData,
batch_size: usize,
data: LenValueData,
}
impl LenValueMap {
pub fn new() -> Self {
LenValueMap {
map: LenValueData::new(),
batch_size: DEFAULT_DICT_BATCH_SIZE,
data: LenValueData::new(),
}
}
pub fn with_data(mut self, data: LenValueData) -> Self {
self.map = data;
self.data = data;
self
}
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
self.batch_size = batch_size;
self
/// Borrows the underlying length -> value -> words table without consuming
/// the map.
pub fn data(&self) -> &LenValueData {
&self.data
}
/// Consumes the map and returns the owned length -> value -> words table.
pub fn into_data(self) -> LenValueData {
self.data
}
pub fn build<E: SystemEncoder, R: DictRepository>(encoder: &E, repo: &R) -> Self {
let mut map = LenValueData::new();
pub fn insert_words(
&mut self,
words: Vec<String>,
decoder: &impl SystemDecoder,
) -> Result<(), Error> {
for word in words {
if word.is_empty() {
continue;
}
let decoded = decoder.decode(&word);
let int_value = decoded.parse::<u64>()?;
let len = decoded.len() as u8;
if let Some(len_item) = self.data.get_mut(&len) {
if let Some(value_item) = len_item.get_mut(&int_value) {
value_item.push(word);
} else {
len_item.insert(int_value, vec![word]);
}
} else {
self.data
.insert(len, HashMap::from([(int_value, vec![word])]));
}
}
Ok(())
}
LenValueMap::new().with_data(map)
/// Convenience constructor: delegates to the private `build`, passing
/// `DEFAULT_DICT_BATCH_SIZE` as the batch size.
///
/// NOTE(review): `build` is currently `todo!()`, so calling this panics until
/// it is implemented.
pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self {
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach?
}
pub fn is_empty(&self) -> bool {
self.map.is_empty()
self.data.is_empty()
}
/// Not implemented yet: the body is `todo!()`, so any call panics and all
/// three parameters are currently unused.
///
/// NOTE(review): presumably meant to fetch words from `repo` in chunks of
/// `batch_size` and insert them through `decoder` — confirm before relying on it.
fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self {
todo!()
}
}
@ -61,137 +97,118 @@ mod tests {
use crate::core::{entities::*, errors::*};
use std::collections::HashMap;
#[test]
fn test_no_inpout_then_empty_map() {
let repo = MockRepository::new().with_single_word_dict();
let encoder = MockEncoder::new(EncodingResult {
input: "".to_string(),
output: vec![],
});
let lv_map = LenValueMap::build(&encoder, &repo);
assert!(lv_map.is_empty());
use mockall::automock;
use mockall::{mock, predicate::*};
const TEST_WORD_1: &str = "test_word_1";
const TEST_WORD_2: &str = "test_word_2";
const TEST_WORD_3: &str = "test_word_3";
const TEST_WORD_4: &str = "test_word_4";
const TEST_NUM_1: u64 = 12;
const TEST_NUM_2: u64 = 34;
const TEST_NUM_3: u64 = 9876;
const TEST_NUM_1_LEN: u8 = 2;
const TEST_NUM_3_LEN: u8 = 4;
/// Fixture table mapping each test word to the digit string the mocked
/// decoder should return for it. Words 3 and 4 share the same number.
fn get_test_dec_map() -> HashMap<String, DecodedValue> {
    let pairs = [
        (TEST_WORD_1, TEST_NUM_1),
        (TEST_WORD_2, TEST_NUM_2),
        (TEST_WORD_3, TEST_NUM_3),
        (TEST_WORD_4, TEST_NUM_3),
    ];
    pairs
        .iter()
        .map(|&(word, number)| (word.to_string(), number.to_string()))
        .collect()
}
// #[test]
// fn test_empty_dict_then_no_output() {
// let dict = Dict {
// name: "test".to_string(),
// entries: HashMap::new(),
// };
// let repo = MockRepository::new(dict);
// let encoder = Encoder::new(&repo);
// let result = encoder.encode("test").unwrap();
// assert!(result.output.is_empty());
// }
// #[test]
// fn test_encode_single_digit() {
// let dict = get_single_word_dict();
// let repo = MockRepository::new(dict);
// let encoder = Encoder::new(&repo).with_batch_size(1);
// let result = encoder.encode("test").unwrap();
// assert!(result.output.is_empty());
// }
// ---------- Helpers ----------
fn get_single_word_dict() -> Dict {
Dict {
name: "test_dict".to_string(),
entries: HashMap::from([(
1,
DictEntry {
id: Some(1),
text: "test_word_1".to_string(),
metadata: HashMap::new(),
},
)]),
}
/// Stand-in for a real decoder: looks `word` up in the fixture table.
///
/// Panics when `word` is not one of the fixture words.
fn mock_decoding(word: &str) -> DecodedValue {
    let mut table = get_test_dec_map();
    table.remove(word).unwrap()
}
// ---------- Mocks ----------
struct MockEncoder {
result: EncodingResult,
/// Returns the four fixture words, in order, as owned strings.
fn get_test_words() -> Vec<String> {
    [TEST_WORD_1, TEST_WORD_2, TEST_WORD_3, TEST_WORD_4]
        .iter()
        .map(|&word| word.to_string())
        .collect()
}
impl MockEncoder {
fn new(result: EncodingResult) -> Self {
MockEncoder { result }
// Generates `MockDecoder`, a mockall-backed `SystemDecoder`; each test sets
// the behaviour of `decode` through `expect_decode()`.
mock! {
pub Decoder {}
impl SystemDecoder for Decoder {
fn decode(&self, word: &str) -> DecodedValue;
}
}
impl SystemEncoder for MockEncoder {
fn initialize(&self) -> Result<(), EncoderError> {
Ok(())
}
fn encode(&self, _word: &str) -> Result<EncodingResult, EncoderError> {
Ok(self.result.clone())
}
}
#[test]
fn test_insert_words_empty() {
let words = vec![];
let mut decoder = MockDecoder::new();
decoder.expect_decode().returning(|_| DecodedValue::new());
struct MockRepository {
dict: Dict,
let mut lv_map = LenValueMap::new();
lv_map.insert_words(words, &decoder).unwrap();
assert!(lv_map.is_empty());
}
impl MockRepository {
pub fn new() -> Self {
MockRepository {
dict: Dict::new("test_dict".to_string()),
}
}
pub fn with_single_word_dict(mut self) -> Self {
self.dict = get_single_word_dict();
self
}
/// A single word must yield exactly one length bucket, holding one value
/// whose word list contains just that word.
#[test]
fn test_single_word() {
    let mut decoder = MockDecoder::new();
    decoder.expect_decode().returning(|w| mock_decoding(w));

    let input = vec![TEST_WORD_1.to_string()];
    let mut lv_map = LenValueMap::new();
    lv_map.insert_words(input, &decoder).unwrap();

    // Outer level: keyed by decoded length.
    let by_len = lv_map.into_data();
    assert_eq!(by_len.len(), 1);
    assert!(by_len.contains_key(&TEST_NUM_1_LEN));

    // Inner level: keyed by decoded numeric value.
    let by_value = by_len.get(&TEST_NUM_1_LEN).unwrap();
    assert!(by_value.contains_key(&TEST_NUM_1));

    let stored = by_value.get(&TEST_NUM_1).unwrap();
    assert_eq!(stored.len(), 1);
    assert_eq!(stored[0], TEST_WORD_1);
}
#[async_trait::async_trait]
impl DictRepository for MockRepository {
async fn create_dict(&self, _name: &str) -> Result<(), RepositoryError> {
Ok(())
}
async fn save_entries(
&self,
_dict_name: &str,
_entries: &[DictEntry],
) -> Result<(), RepositoryError> {
todo!()
}
async fn fetch_many(
&self,
_name: &str,
limit: Option<u32>,
offset: Option<u32>,
) -> Result<Dict, RepositoryError> {
let offset = offset.unwrap_or(0) as usize;
let limit = limit.unwrap_or(u32::MAX) as usize;
let mut entries_vec: Vec<_> = self.dict.entries.iter().collect();
entries_vec.sort_by_key(|&(id, _)| *id);
let paginated = entries_vec.into_iter().skip(offset).take(limit);
let paginated_map: HashMap<u64, DictEntry> = paginated
.map(|(id, entry)| {
(
*id,
DictEntry {
id: entry.id,
text: entry.text.clone(),
metadata: entry.metadata.clone(),
},
)
})
.collect();
Ok(Dict {
name: self.dict.name.clone(),
entries: paginated_map,
})
}
/// Four words decoding to three numbers of two distinct lengths must be
/// grouped by length, then by value, keeping insertion order within a value.
#[test]
fn test_multiple_words() {
    let mut decoder = MockDecoder::new();
    decoder.expect_decode().returning(|w| mock_decoding(w));

    let mut lv_map = LenValueMap::new();
    lv_map.insert_words(get_test_words(), &decoder).unwrap();

    let by_len = lv_map.into_data();
    assert_eq!(by_len.len(), 2); // two different lengths
    assert!(by_len.contains_key(&TEST_NUM_1_LEN));
    assert!(by_len.contains_key(&TEST_NUM_3_LEN));

    let two_digit = by_len.get(&TEST_NUM_1_LEN).unwrap();
    let four_digit = by_len.get(&TEST_NUM_3_LEN).unwrap();
    assert_eq!(two_digit.len(), 2); // two numbers
    assert_eq!(four_digit.len(), 1); // one number
    assert!(two_digit.contains_key(&TEST_NUM_1));
    assert!(two_digit.contains_key(&TEST_NUM_2));
    assert!(four_digit.contains_key(&TEST_NUM_3));

    // Each two-digit number is backed by exactly one word.
    let singles = [(TEST_NUM_1, TEST_WORD_1), (TEST_NUM_2, TEST_WORD_2)];
    for &(number, expected) in singles.iter() {
        let stored = two_digit.get(&number).unwrap();
        assert_eq!(stored.len(), 1);
        assert_eq!(stored[0], expected);
    }

    // The four-digit number is shared by two words, kept in insertion order.
    let shared = four_digit.get(&TEST_NUM_3).unwrap();
    assert_eq!(shared.len(), 2);
    assert_eq!(shared[0], TEST_WORD_3);
    assert_eq!(shared[1], TEST_WORD_4);
}
}

2
lib/src/core/traits.rs

@ -8,7 +8,7 @@ use super::errors::RepositoryError;
/// but decoded as one number. For partial values, we can use
/// u64, but for the whole decoded value that may be very long,
/// we need a string.
type DecodedValue = String;
pub type DecodedValue = String;
pub trait SystemDecoder {
fn decode(&self, word: &str) -> DecodedValue;

Loading…
Cancel
Save