|
|
|
|
@ -1,6 +1,10 @@
|
|
|
|
|
use std::collections::HashMap; |
|
|
|
|
|
|
|
|
|
use crate::core::{DictRepository, SystemEncoder}; |
|
|
|
|
use anyhow::Error; |
|
|
|
|
|
|
|
|
|
use crate::core::errors::RepositoryError; |
|
|
|
|
use crate::core::traits::DecodedValue; |
|
|
|
|
use crate::core::{DictRepository, SystemDecoder}; |
|
|
|
|
|
|
|
|
|
// We store words by encoded number length, then encoded value
|
|
|
|
|
// Example:
|
|
|
|
|
@ -15,6 +19,7 @@ use crate::core::{DictRepository, SystemEncoder};
|
|
|
|
|
// - 45:
|
|
|
|
|
// - word: oral
|
|
|
|
|
|
|
|
|
|
// Words are fetched from DictRepository in batches
|
|
|
|
|
const DEFAULT_DICT_BATCH_SIZE: usize = 100; |
|
|
|
|
|
|
|
|
|
type ValueLength = u8; |
|
|
|
|
@ -23,35 +28,66 @@ pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>;
|
|
|
|
|
|
|
|
|
|
#[derive(Debug)] |
|
|
|
|
pub struct LenValueMap { |
|
|
|
|
map: LenValueData, |
|
|
|
|
batch_size: usize, |
|
|
|
|
data: LenValueData, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl LenValueMap { |
|
|
|
|
pub fn new() -> Self { |
|
|
|
|
LenValueMap { |
|
|
|
|
map: LenValueData::new(), |
|
|
|
|
batch_size: DEFAULT_DICT_BATCH_SIZE, |
|
|
|
|
data: LenValueData::new(), |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn with_data(mut self, data: LenValueData) -> Self { |
|
|
|
|
self.map = data; |
|
|
|
|
self.data = data; |
|
|
|
|
self |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn with_batch_size(mut self, batch_size: usize) -> Self { |
|
|
|
|
self.batch_size = batch_size; |
|
|
|
|
self |
|
|
|
|
pub fn data(&self) -> &LenValueData { |
|
|
|
|
&self.data |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn into_data(self) -> LenValueData { |
|
|
|
|
self.data |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn build<E: SystemEncoder, R: DictRepository>(encoder: &E, repo: &R) -> Self { |
|
|
|
|
let mut map = LenValueData::new(); |
|
|
|
|
pub fn insert_words( |
|
|
|
|
&mut self, |
|
|
|
|
words: Vec<String>, |
|
|
|
|
decoder: &impl SystemDecoder, |
|
|
|
|
) -> Result<(), Error> { |
|
|
|
|
for word in words { |
|
|
|
|
if word.is_empty() { |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
let decoded = decoder.decode(&word); |
|
|
|
|
let int_value = decoded.parse::<u64>()?; |
|
|
|
|
let len = decoded.len() as u8; |
|
|
|
|
|
|
|
|
|
if let Some(len_item) = self.data.get_mut(&len) { |
|
|
|
|
if let Some(value_item) = len_item.get_mut(&int_value) { |
|
|
|
|
value_item.push(word); |
|
|
|
|
} else { |
|
|
|
|
len_item.insert(int_value, vec![word]); |
|
|
|
|
} |
|
|
|
|
} else { |
|
|
|
|
self.data |
|
|
|
|
.insert(len, HashMap::from([(int_value, vec![word])])); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
Ok(()) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
LenValueMap::new().with_data(map) |
|
|
|
|
pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { |
|
|
|
|
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach?
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn is_empty(&self) -> bool { |
|
|
|
|
self.map.is_empty() |
|
|
|
|
self.data.is_empty() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self { |
|
|
|
|
todo!() |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
@ -61,137 +97,118 @@ mod tests {
|
|
|
|
|
use crate::core::{entities::*, errors::*}; |
|
|
|
|
use std::collections::HashMap; |
|
|
|
|
|
|
|
|
|
#[test] |
|
|
|
|
fn test_no_inpout_then_empty_map() { |
|
|
|
|
let repo = MockRepository::new().with_single_word_dict(); |
|
|
|
|
let encoder = MockEncoder::new(EncodingResult { |
|
|
|
|
input: "".to_string(), |
|
|
|
|
output: vec![], |
|
|
|
|
}); |
|
|
|
|
let lv_map = LenValueMap::build(&encoder, &repo); |
|
|
|
|
assert!(lv_map.is_empty()); |
|
|
|
|
use mockall::automock; |
|
|
|
|
use mockall::{mock, predicate::*}; |
|
|
|
|
|
|
|
|
|
const TEST_WORD_1: &str = "test_word_1"; |
|
|
|
|
const TEST_WORD_2: &str = "test_word_2"; |
|
|
|
|
const TEST_WORD_3: &str = "test_word_3"; |
|
|
|
|
const TEST_WORD_4: &str = "test_word_4"; |
|
|
|
|
const TEST_NUM_1: u64 = 12; |
|
|
|
|
const TEST_NUM_2: u64 = 34; |
|
|
|
|
const TEST_NUM_3: u64 = 9876; |
|
|
|
|
const TEST_NUM_1_LEN: u8 = 2; |
|
|
|
|
const TEST_NUM_3_LEN: u8 = 4; |
|
|
|
|
|
|
|
|
|
fn get_test_dec_map() -> HashMap<String, DecodedValue> { |
|
|
|
|
HashMap::from([ |
|
|
|
|
(TEST_WORD_1.to_string(), TEST_NUM_1.to_string()), |
|
|
|
|
(TEST_WORD_2.to_string(), TEST_NUM_2.to_string()), |
|
|
|
|
(TEST_WORD_3.to_string(), TEST_NUM_3.to_string()), |
|
|
|
|
(TEST_WORD_4.to_string(), TEST_NUM_3.to_string()), |
|
|
|
|
]) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// #[test]
|
|
|
|
|
// fn test_empty_dict_then_no_output() {
|
|
|
|
|
// let dict = Dict {
|
|
|
|
|
// name: "test".to_string(),
|
|
|
|
|
// entries: HashMap::new(),
|
|
|
|
|
// };
|
|
|
|
|
// let repo = MockRepository::new(dict);
|
|
|
|
|
// let encoder = Encoder::new(&repo);
|
|
|
|
|
// let result = encoder.encode("test").unwrap();
|
|
|
|
|
// assert!(result.output.is_empty());
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// #[test]
|
|
|
|
|
// fn test_encode_single_digit() {
|
|
|
|
|
// let dict = get_single_word_dict();
|
|
|
|
|
// let repo = MockRepository::new(dict);
|
|
|
|
|
// let encoder = Encoder::new(&repo).with_batch_size(1);
|
|
|
|
|
// let result = encoder.encode("test").unwrap();
|
|
|
|
|
// assert!(result.output.is_empty());
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// ---------- Helpers ----------
|
|
|
|
|
|
|
|
|
|
fn get_single_word_dict() -> Dict { |
|
|
|
|
Dict { |
|
|
|
|
name: "test_dict".to_string(), |
|
|
|
|
entries: HashMap::from([( |
|
|
|
|
1, |
|
|
|
|
DictEntry { |
|
|
|
|
id: Some(1), |
|
|
|
|
text: "test_word_1".to_string(), |
|
|
|
|
metadata: HashMap::new(), |
|
|
|
|
}, |
|
|
|
|
)]), |
|
|
|
|
} |
|
|
|
|
fn mock_decoding(word: &str) -> DecodedValue { |
|
|
|
|
get_test_dec_map().remove(word).unwrap() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// ---------- Mocks ----------
|
|
|
|
|
|
|
|
|
|
struct MockEncoder { |
|
|
|
|
result: EncodingResult, |
|
|
|
|
fn get_test_words() -> Vec<String> { |
|
|
|
|
vec![ |
|
|
|
|
TEST_WORD_1.to_string(), |
|
|
|
|
TEST_WORD_2.to_string(), |
|
|
|
|
TEST_WORD_3.to_string(), |
|
|
|
|
TEST_WORD_4.to_string(), |
|
|
|
|
] |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl MockEncoder { |
|
|
|
|
fn new(result: EncodingResult) -> Self { |
|
|
|
|
MockEncoder { result } |
|
|
|
|
mock! { |
|
|
|
|
pub Decoder {} |
|
|
|
|
impl SystemDecoder for Decoder { |
|
|
|
|
fn decode(&self, word: &str) -> DecodedValue; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl SystemEncoder for MockEncoder { |
|
|
|
|
fn initialize(&self) -> Result<(), EncoderError> { |
|
|
|
|
Ok(()) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
fn encode(&self, _word: &str) -> Result<EncodingResult, EncoderError> { |
|
|
|
|
Ok(self.result.clone()) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#[test] |
|
|
|
|
fn test_insert_words_empty() { |
|
|
|
|
let words = vec![]; |
|
|
|
|
let mut decoder = MockDecoder::new(); |
|
|
|
|
decoder.expect_decode().returning(|_| DecodedValue::new()); |
|
|
|
|
|
|
|
|
|
struct MockRepository { |
|
|
|
|
dict: Dict, |
|
|
|
|
let mut lv_map = LenValueMap::new(); |
|
|
|
|
lv_map.insert_words(words, &decoder).unwrap(); |
|
|
|
|
assert!(lv_map.is_empty()); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl MockRepository { |
|
|
|
|
pub fn new() -> Self { |
|
|
|
|
MockRepository { |
|
|
|
|
dict: Dict::new("test_dict".to_string()), |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn with_single_word_dict(mut self) -> Self { |
|
|
|
|
self.dict = get_single_word_dict(); |
|
|
|
|
self |
|
|
|
|
} |
|
|
|
|
// A single word must end up under its decoded length and its decoded value.
#[test]
fn test_single_word() {
    let words = vec![TEST_WORD_1.to_string()];
    let mut decoder = MockDecoder::new();
    decoder
        .expect_decode()
        .returning(|word| mock_decoding(word));

    let mut lv_map = LenValueMap::new();
    lv_map.insert_words(words, &decoder).unwrap();

    let data = lv_map.into_data();

    // Exactly one length bucket, holding the decoded value of the word.
    assert_eq!(data.len(), 1);
    assert!(data.contains_key(&TEST_NUM_1_LEN));
    let by_value = data.get(&TEST_NUM_1_LEN).unwrap();
    assert!(by_value.contains_key(&TEST_NUM_1));

    let words = by_value.get(&TEST_NUM_1).unwrap();
    assert_eq!(words.len(), 1);
    assert_eq!(words[0], TEST_WORD_1);
}
|
|
|
|
|
|
|
|
|
#[async_trait::async_trait] |
|
|
|
|
impl DictRepository for MockRepository { |
|
|
|
|
async fn create_dict(&self, _name: &str) -> Result<(), RepositoryError> { |
|
|
|
|
Ok(()) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
async fn save_entries( |
|
|
|
|
&self, |
|
|
|
|
_dict_name: &str, |
|
|
|
|
_entries: &[DictEntry], |
|
|
|
|
) -> Result<(), RepositoryError> { |
|
|
|
|
todo!() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
async fn fetch_many( |
|
|
|
|
&self, |
|
|
|
|
_name: &str, |
|
|
|
|
limit: Option<u32>, |
|
|
|
|
offset: Option<u32>, |
|
|
|
|
) -> Result<Dict, RepositoryError> { |
|
|
|
|
let offset = offset.unwrap_or(0) as usize; |
|
|
|
|
let limit = limit.unwrap_or(u32::MAX) as usize; |
|
|
|
|
|
|
|
|
|
let mut entries_vec: Vec<_> = self.dict.entries.iter().collect(); |
|
|
|
|
entries_vec.sort_by_key(|&(id, _)| *id); |
|
|
|
|
|
|
|
|
|
let paginated = entries_vec.into_iter().skip(offset).take(limit); |
|
|
|
|
let paginated_map: HashMap<u64, DictEntry> = paginated |
|
|
|
|
.map(|(id, entry)| { |
|
|
|
|
( |
|
|
|
|
*id, |
|
|
|
|
DictEntry { |
|
|
|
|
id: entry.id, |
|
|
|
|
text: entry.text.clone(), |
|
|
|
|
metadata: entry.metadata.clone(), |
|
|
|
|
}, |
|
|
|
|
) |
|
|
|
|
}) |
|
|
|
|
.collect(); |
|
|
|
|
|
|
|
|
|
Ok(Dict { |
|
|
|
|
name: self.dict.name.clone(), |
|
|
|
|
entries: paginated_map, |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
// Several words: two share a length but differ in value, and two share the same value.
#[test]
fn test_multiple_words() {
    let words = get_test_words();

    let mut decoder = MockDecoder::new();
    decoder
        .expect_decode()
        .returning(|word| mock_decoding(word));

    let mut lv_map = LenValueMap::new();
    lv_map.insert_words(words, &decoder).unwrap();

    let data = lv_map.into_data();

    assert_eq!(data.len(), 2); // two different lengths
    assert!(data.contains_key(&TEST_NUM_1_LEN));
    assert!(data.contains_key(&TEST_NUM_3_LEN));
    let l2 = data.get(&TEST_NUM_1_LEN).unwrap();
    let l4 = data.get(&TEST_NUM_3_LEN).unwrap();

    assert_eq!(l2.len(), 2); // two numbers
    assert_eq!(l4.len(), 1); // one number
    assert!(l2.contains_key(&TEST_NUM_1));
    assert!(l2.contains_key(&TEST_NUM_2));
    assert!(l4.contains_key(&TEST_NUM_3));

    let words = l2.get(&TEST_NUM_1).unwrap();
    assert_eq!(words.len(), 1);
    assert_eq!(words[0], TEST_WORD_1);

    let words = l2.get(&TEST_NUM_2).unwrap();
    assert_eq!(words.len(), 1);
    assert_eq!(words[0], TEST_WORD_2);

    // Words that decode to the same value keep their insertion order.
    let words = l4.get(&TEST_NUM_3).unwrap();
    assert_eq!(words.len(), 2);
    assert_eq!(words[0], TEST_WORD_3);
    assert_eq!(words[1], TEST_WORD_4);
}
|
|
|
|
} |
|
|
|
|
|