|
|
|
@ -1,10 +1,6 @@ |
|
|
|
use std::collections::HashMap; |
|
|
|
use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError}; |
|
|
|
|
|
|
|
use std::{collections::HashMap, num::ParseIntError}; |
|
|
|
use anyhow::Error; |
|
|
|
use thiserror::Error; |
|
|
|
|
|
|
|
|
|
|
|
use crate::core::errors::RepositoryError; |
|
|
|
|
|
|
|
use crate::core::traits::DecodedValue; |
|
|
|
|
|
|
|
use crate::core::{DictRepository, SystemDecoder}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// We store words by encoded number length, then encoded value
|
|
|
|
// We store words by encoded number length, then encoded value
|
|
|
|
// Example:
|
|
|
|
// Example:
|
|
|
|
@ -22,68 +18,62 @@ use crate::core::{DictRepository, SystemDecoder}; |
|
|
|
// Words are fetched from DictRepository in batches
|
|
|
|
// Words are fetched from DictRepository in batches
|
|
|
|
const DEFAULT_DICT_BATCH_SIZE: usize = 100; |
|
|
|
const DEFAULT_DICT_BATCH_SIZE: usize = 100; |
|
|
|
|
|
|
|
|
|
|
|
type ValueLength = u8; |
|
|
|
#[derive(Error, Debug)] |
|
|
|
type Value = u64; |
|
|
|
pub enum LenValueMapError { |
|
|
|
pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>; |
|
|
|
#[error("Value parsing error: {0}")] |
|
|
|
|
|
|
|
Parse(#[from] ParseIntError), |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[error(transparent)] |
|
|
|
|
|
|
|
Codec(#[from] CodecError), |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#[derive(Debug)] |
|
|
|
type DecodedNumber = u64; |
|
|
|
|
|
|
|
pub type LenValueData = HashMap<DecodedLength, HashMap<DecodedNumber, Vec<String>>>; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Default, Clone)] |
|
|
|
pub struct LenValueMap { |
|
|
|
pub struct LenValueMap { |
|
|
|
data: LenValueData, |
|
|
|
data: LenValueData, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
impl LenValueMap { |
|
|
|
impl LenValueMap { |
|
|
|
pub fn new() -> Self { |
|
|
|
pub fn new() -> Self { |
|
|
|
LenValueMap { |
|
|
|
Self::default() |
|
|
|
data: LenValueData::new(), |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pub fn with_data(mut self, data: LenValueData) -> Self { |
|
|
|
pub fn is_empty(&self) -> bool { |
|
|
|
self.data = data; |
|
|
|
self.data.is_empty() |
|
|
|
self |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn data(&self) -> &LenValueData { |
|
|
|
|
|
|
|
&self.data |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pub fn into_data(self) -> LenValueData { |
|
|
|
pub fn into_data(self) -> LenValueData { |
|
|
|
self.data |
|
|
|
self.data |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pub fn insert_words( |
|
|
|
pub fn insert_words<I>( |
|
|
|
&mut self, |
|
|
|
&mut self, |
|
|
|
words: Vec<String>, |
|
|
|
words: I, |
|
|
|
decoder: &impl SystemDecoder, |
|
|
|
decoder: &dyn SystemDecoder, |
|
|
|
) -> Result<(), Error> { |
|
|
|
) -> Result<(), LenValueMapError> |
|
|
|
|
|
|
|
where |
|
|
|
|
|
|
|
I: IntoIterator<Item = String>, |
|
|
|
|
|
|
|
{ |
|
|
|
for word in words { |
|
|
|
for word in words { |
|
|
|
if word.is_empty() { |
|
|
|
if word.is_empty() { |
|
|
|
continue; |
|
|
|
continue; |
|
|
|
} |
|
|
|
} |
|
|
|
let decoded = decoder.decode(&word); |
|
|
|
let decoded = decoder.decode(&word)?; |
|
|
|
let int_value = decoded.parse::<u64>()?; |
|
|
|
|
|
|
|
let len = decoded.len() as u8; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if let Some(len_item) = self.data.get_mut(&len) { |
|
|
|
|
|
|
|
if let Some(value_item) = len_item.get_mut(&int_value) { |
|
|
|
|
|
|
|
value_item.push(word); |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
len_item.insert(int_value, vec![word]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
self.data |
|
|
|
self.data |
|
|
|
.insert(len, HashMap::from([(int_value, vec![word])])); |
|
|
|
.entry(decoded.value_len()?) |
|
|
|
} |
|
|
|
.or_default() |
|
|
|
|
|
|
|
.entry(decoded.parse()?) |
|
|
|
|
|
|
|
.or_default() |
|
|
|
|
|
|
|
.push(word); |
|
|
|
} |
|
|
|
} |
|
|
|
Ok(()) |
|
|
|
Ok(()) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { |
|
|
|
pub async fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self { |
|
|
|
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach?
|
|
|
|
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn is_empty(&self) -> bool { |
|
|
|
|
|
|
|
self.data.is_empty() |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self { |
|
|
|
fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self { |
|
|
|
@ -107,20 +97,26 @@ mod tests { |
|
|
|
const TEST_NUM_1: u64 = 12; |
|
|
|
const TEST_NUM_1: u64 = 12; |
|
|
|
const TEST_NUM_2: u64 = 34; |
|
|
|
const TEST_NUM_2: u64 = 34; |
|
|
|
const TEST_NUM_3: u64 = 9876; |
|
|
|
const TEST_NUM_3: u64 = 9876; |
|
|
|
const TEST_NUM_1_LEN: u8 = 2; |
|
|
|
const TEST_NUM_1_LEN: DecodedLength = DecodedLength::from(2); |
|
|
|
const TEST_NUM_3_LEN: u8 = 4; |
|
|
|
const TEST_NUM_3_LEN: DecodedLength = DecodedLength::from(4); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fn decoded_value(n: u64) -> DecodedValue { |
|
|
|
|
|
|
|
DecodedValue::new(n.to_string()).unwrap() |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn get_test_dec_map() -> HashMap<String, DecodedValue> { |
|
|
|
fn get_test_dec_map() -> HashMap<String, DecodedValue> { |
|
|
|
HashMap::from([ |
|
|
|
HashMap::from([ |
|
|
|
(TEST_WORD_1.to_string(), TEST_NUM_1.to_string()), |
|
|
|
(TEST_WORD_1.to_string(), decoded_value(TEST_NUM_1)), |
|
|
|
(TEST_WORD_2.to_string(), TEST_NUM_2.to_string()), |
|
|
|
(TEST_WORD_2.to_string(), decoded_value(TEST_NUM_2)), |
|
|
|
(TEST_WORD_3.to_string(), TEST_NUM_3.to_string()), |
|
|
|
(TEST_WORD_3.to_string(), decoded_value(TEST_NUM_3)), |
|
|
|
(TEST_WORD_4.to_string(), TEST_NUM_3.to_string()), |
|
|
|
(TEST_WORD_4.to_string(), decoded_value(TEST_NUM_3)), |
|
|
|
]) |
|
|
|
]) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn mock_decoding(word: &str) -> DecodedValue { |
|
|
|
fn mock_decoding(word: &str) -> Result<DecodedValue, CodecError> { |
|
|
|
get_test_dec_map().remove(word).unwrap() |
|
|
|
get_test_dec_map() |
|
|
|
|
|
|
|
.remove(word) |
|
|
|
|
|
|
|
.ok_or_else(|| CodecError::UnexpectedError("".to_string())) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn get_test_words() -> Vec<String> { |
|
|
|
fn get_test_words() -> Vec<String> { |
|
|
|
@ -135,19 +131,8 @@ mod tests { |
|
|
|
mock! { |
|
|
|
mock! { |
|
|
|
pub Decoder {} |
|
|
|
pub Decoder {} |
|
|
|
impl SystemDecoder for Decoder { |
|
|
|
impl SystemDecoder for Decoder { |
|
|
|
fn decode(&self, word: &str) -> DecodedValue; |
|
|
|
fn decode(&self, word: &str) -> Result<DecodedValue, CodecError>; |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#[test] |
|
|
|
|
|
|
|
fn test_insert_words_empty() { |
|
|
|
|
|
|
|
let words = vec![]; |
|
|
|
|
|
|
|
let mut decoder = MockDecoder::new(); |
|
|
|
|
|
|
|
decoder.expect_decode().returning(|_| DecodedValue::new()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let mut lv_map = LenValueMap::new(); |
|
|
|
|
|
|
|
lv_map.insert_words(words, &decoder).unwrap(); |
|
|
|
|
|
|
|
assert!(lv_map.is_empty()); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#[test] |
|
|
|
#[test] |
|
|
|
|