Browse Source

WIP: lv_map refactor & tests

develop-refactor
chodak166 5 months ago
parent
commit
6189b4318b
  1. 96
      lib/src/core/entities.rs
  2. 28
      lib/src/core/errors.rs
  3. 7
      lib/src/core/sys_major/decoder.rs
  4. 26
      lib/src/core/sys_major/decoder_tests.rs
  5. 8
      lib/src/core/sys_major/encoder.rs
  6. 115
      lib/src/core/sys_major/lvmap.rs
  7. 24
      lib/src/core/sys_major/rules_pl.rs
  8. 18
      lib/src/core/traits.rs
  9. 4
      lib/src/presentation/cli/commands/decode.rs

96
lib/src/core/entities.rs

@ -1,4 +1,81 @@
use std::collections::HashMap;
use super::errors::CodecError;
use std::num::ParseIntError;
use std::{collections::HashMap, u64};
/// A number encoded as a sequence of words.
///
/// Newtype over `Vec<String>`; the inner vector is a private field.
#[derive(Debug, Clone)]
pub struct EncodedValue(Vec<String>);
/// A decoded number, stored as text.
///
/// The number value can be encoded as many word sets, but decoded as one
/// number. For partial values and dictionary words (reasonable length) a
/// `u64` (20-digit number) is enough, but the whole input text can be longer
/// than 20 digits, so the value is kept as a `String` of at most 255 bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DecodedValue(String);

impl DecodedValue {
    /// Wraps `value` as a decoded value.
    ///
    /// # Errors
    ///
    /// Returns [`CodecError::TextTooLong`] (carrying the actual byte length)
    /// when `value` is longer than `u8::MAX` bytes.
    pub fn new(value: String) -> Result<Self, CodecError> {
        if value.len() > u8::MAX as usize {
            Err(CodecError::TextTooLong(value.len()))
        } else {
            Ok(Self(value))
        }
    }

    /// Borrows the decoded text.
    pub fn as_str(&self) -> &str {
        &self.0
    }

    /// Parses the decoded text as a `u64`.
    ///
    /// # Errors
    ///
    /// Fails whenever `str::parse::<u64>` fails, e.g. for an empty value or
    /// one that does not fit in 64 bits.
    pub fn parse(&self) -> Result<u64, ParseIntError> {
        self.0.parse::<u64>()
    }

    /// Length of the decoded text in bytes.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when the decoded text is empty.
    ///
    /// Provided alongside [`DecodedValue::len`] so callers do not have to
    /// compare the length with zero themselves.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Length of the decoded text as a [`DecodedLength`].
    ///
    /// # Errors
    ///
    /// Returns [`CodecError::EmptyValue`] for an empty value; otherwise
    /// forwards whatever the `usize` -> `DecodedLength` conversion reports.
    pub fn value_len(&self) -> Result<DecodedLength, CodecError> {
        if self.is_empty() {
            return Err(CodecError::EmptyValue);
        }
        DecodedLength::try_from(self.len())
    }
}
/// Lets a `DecodedValue` be compared directly against a string slice,
/// e.g. `value == "42"`.
impl PartialEq<&str> for DecodedValue {
    fn eq(&self, rhs: &&str) -> bool {
        // Compare the wrapped text as `&str` against the given slice.
        let text: &str = self.0.as_str();
        text == *rhs
    }
}
/// Mirror of the `DecodedValue == &str` comparison so the string slice can
/// appear on the left-hand side, e.g. `"42" == value`.
impl PartialEq<DecodedValue> for &str {
    fn eq(&self, rhs: &DecodedValue) -> bool {
        // Compare the slice against the wrapped text viewed as `&str`.
        let text: &str = rhs.0.as_str();
        *self == text
    }
}
/// Length of a decoded value, stored in a single byte (0..=255).
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct DecodedLength(u8);

impl DecodedLength {
    /// Builds a length directly from a `u8`; usable in `const` contexts.
    pub const fn from(value: u8) -> Self {
        DecodedLength(value)
    }
}

impl TryFrom<usize> for DecodedLength {
    type Error = CodecError;

    /// Converts a `usize` length, reporting
    /// [`CodecError::ValueLimitExceeded`] (with the offending value) when it
    /// does not fit in a `u8`.
    fn try_from(value: usize) -> Result<Self, CodecError> {
        // The checked narrowing fails exactly when `value > u8::MAX`.
        u8::try_from(value)
            .map(Self)
            .map_err(|_| CodecError::ValueLimitExceeded(value))
    }
}
// --- Dictionary ---
pub type DictEntryId = u64;
@ -37,20 +114,3 @@ impl Dict {
self.entries.insert(entry.id.unwrap(), entry);
}
}
/// A single encoded item, wrapping the dictionary entry it refers to.
#[derive(Debug, Clone)]
pub struct EncodedItem {
/// The wrapped dictionary entry.
pub entry: DictEntry,
}
impl EncodedItem {
/// Returns the `text` field of the wrapped entry.
pub fn value(&self) -> &str {
&self.entry.text
}
}
/// Pairs an input string with the list of encoded items produced for it.
#[derive(Debug, Clone)]
pub struct EncodingResult {
/// The input text (presumably the text that was encoded — confirm against the encoder).
pub input: String,
/// The encoded items.
pub output: Vec<EncodedItem>,
}

28
lib/src/core/errors.rs

@ -4,24 +4,28 @@ use thiserror::Error;
pub enum RepositoryError {
#[error("Data source connection failed")]
ConnectionFailed,
#[error("Dictionary '{0}' not found")]
#[error("'{0}' not found")]
NotFound(String),
#[error("Storage error: {0}")]
StorageError(String),
}
#[derive(Error, Debug)]
pub enum EncoderError {
#[error("Encoder initialization failed")]
#[derive(Debug, Error)]
pub enum CodecError {
#[error("text too long: {0} bytes")]
TextTooLong(usize),
#[error("value too large: {0}/255")]
ValueLimitExceeded(usize),
#[error("operation not allowed on empty value")]
EmptyValue,
#[error("initialization failed")]
InitializationFailed,
#[error("Unexpected error: {0}")]
UnexpectedError(String),
}
#[derive(Error, Debug)]
pub enum DecoderError {
#[error("Decoder input error")]
InputError,
#[error("Unexpected error: {0}")]
#[error("unexpected error: {0}")]
UnexpectedError(String),
}

7
lib/src/core/sys_major/decoder.rs

@ -1,4 +1,4 @@
use crate::core::traits::SystemDecoder;
use crate::core::{entities::DecodedValue, errors::CodecError, traits::SystemDecoder};
#[derive(Debug, Default, Clone)]
pub struct Rule {
@ -108,7 +108,7 @@ impl Decoder {
}
impl SystemDecoder for Decoder {
fn decode(&self, word: &str) -> String {
fn decode(&self, word: &str) -> Result<DecodedValue, CodecError> {
let mut matches: RuleMatches = self
.rules
.iter()
@ -117,6 +117,7 @@ impl SystemDecoder for Decoder {
matches.sort_by_key(|&(pos, _)| pos);
dbg!(&matches);
matches.into_iter().map(|(_, value)| value).collect()
let num_str: String = matches.into_iter().map(|(_, value)| value).collect();
DecodedValue::new(num_str)
}
}

26
lib/src/core/sys_major/decoder_tests.rs

@ -45,90 +45,90 @@ mod tests {
#[test]
fn test_single_symbol_encoding_only_before_only_after_matched() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("ABC");
let output = decoder.decode("ABC").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_double_symbol_encoding_only_before_only_after_matched() {
let decoder = Decoder::new(create_double_rules());
let output = decoder.decode("ABCDEF");
let output = decoder.decode("ABCDEF").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_single_symbol_encoding_only_before_not_matched_with_other() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("DBC");
let output = decoder.decode("DBC").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_double_symbol_encoding_only_before_not_matched_with_other() {
let decoder = Decoder::new(create_double_rules());
let output = decoder.decode("AACDEE");
let output = decoder.decode("AACDEE").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_case_insensitivity() {
let decoder = Decoder::new(create_double_rules());
let output = decoder.decode("abcdef");
let output = decoder.decode("abcdef").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_single_symbol_encoding_only_before_not_matched_with_empty() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("BC");
let output = decoder.decode("BC").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_single_symbol_encoding_only_before_not_matched_with_not_before() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("XBC");
let output = decoder.decode("XBC").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_single_symbol_encoding_only_after_not_matched_with_other() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("ABD");
let output = decoder.decode("ABD").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_single_symbol_encoding_only_after_not_matched_with_empty() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("AB");
let output = decoder.decode("AB").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_single_symbol_encoding_only_after_not_matched_with_not_after() {
let decoder = Decoder::new(create_single_rules());
let output = decoder.decode("ABY");
let output = decoder.decode("ABY").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_single_symbol_encoding_empty_before_after_matched_with_empty() {
let decoder = Decoder::new(create_single_rules_min());
let output = decoder.decode("B");
let output = decoder.decode("B").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_single_symbol_encoding_empty_before_after_matched_with_others() {
let decoder = Decoder::new(create_single_rules_min());
let output = decoder.decode("AXBYC");
let output = decoder.decode("AXBYC").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_encoding_multiple_phonemes() {
let decoder = Decoder::new(create_double_rules());
let output = decoder.decode("VvmNabCd33mn00CD22cdefmn");
let output = decoder.decode("VvmNabCd33mn00CD22cdefmn").unwrap();
assert_eq!(output, "32323")
}
}

8
lib/src/core/sys_major/encoder.rs

@ -1,6 +1,4 @@
use crate::core::{
entities::EncodingResult, errors::EncoderError, sys_major::LenValueMap, traits::*,
};
use crate::core::{entities::EncodedValue, errors::CodecError, sys_major::LenValueMap, traits::*};
#[derive(Debug)]
pub struct Encoder {
@ -14,10 +12,10 @@ impl Encoder {
}
impl SystemEncoder for Encoder {
fn initialize(&self) -> Result<(), EncoderError> {
fn initialize(&self) -> Result<(), CodecError> {
Ok(())
}
fn encode(&self, word: &str) -> Result<EncodingResult, EncoderError> {
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError> {
todo!()
}
}

115
lib/src/core/sys_major/lvmap.rs

@ -1,10 +1,6 @@
use std::collections::HashMap;
use anyhow::Error;
use crate::core::errors::RepositoryError;
use crate::core::traits::DecodedValue;
use crate::core::{DictRepository, SystemDecoder};
use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError};
use std::{collections::HashMap, num::ParseIntError};
use thiserror::Error;
// We store words by encoded number length, then encoded value
// Example:
@ -22,68 +18,62 @@ use crate::core::{DictRepository, SystemDecoder};
// Words are fetched from DictRepository in batches
const DEFAULT_DICT_BATCH_SIZE: usize = 100;
type ValueLength = u8;
type Value = u64;
pub type LenValueData = HashMap<ValueLength, HashMap<Value, Vec<String>>>;
/// Errors reported by the length/value map.
///
/// Both variants carry `#[from]`, so `ParseIntError` and `CodecError`
/// values can be propagated into this type with `?`.
#[derive(Error, Debug)]
pub enum LenValueMapError {
/// A value could not be parsed as an integer.
#[error("Value parsing error: {0}")]
Parse(#[from] ParseIntError),
/// A codec error, displayed transparently (the inner error's own message).
#[error(transparent)]
Codec(#[from] CodecError),
}
type DecodedNumber = u64;
pub type LenValueData = HashMap<DecodedLength, HashMap<DecodedNumber, Vec<String>>>;
#[derive(Debug)]
#[derive(Debug, Default, Clone)]
pub struct LenValueMap {
data: LenValueData,
}
impl LenValueMap {
pub fn new() -> Self {
LenValueMap {
data: LenValueData::new(),
}
Self::default()
}
pub fn with_data(mut self, data: LenValueData) -> Self {
self.data = data;
self
}
pub fn data(&self) -> &LenValueData {
&self.data
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
pub fn into_data(self) -> LenValueData {
self.data
}
pub fn insert_words(
pub fn insert_words<I>(
&mut self,
words: Vec<String>,
decoder: &impl SystemDecoder,
) -> Result<(), Error> {
words: I,
decoder: &dyn SystemDecoder,
) -> Result<(), LenValueMapError>
where
I: IntoIterator<Item = String>,
{
for word in words {
if word.is_empty() {
continue;
}
let decoded = decoder.decode(&word);
let int_value = decoded.parse::<u64>()?;
let len = decoded.len() as u8;
if let Some(len_item) = self.data.get_mut(&len) {
if let Some(value_item) = len_item.get_mut(&int_value) {
value_item.push(word);
} else {
len_item.insert(int_value, vec![word]);
}
} else {
self.data
.insert(len, HashMap::from([(int_value, vec![word])]));
}
let decoded = decoder.decode(&word)?;
self.data
.entry(decoded.value_len()?)
.or_default()
.entry(decoded.parse()?)
.or_default()
.push(word);
}
Ok(())
}
pub fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self {
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE) // is that common approach?
}
pub fn is_empty(&self) -> bool {
self.data.is_empty()
pub async fn from_dict(decoder: &impl SystemDecoder, repo: &impl DictRepository) -> Self {
Self::build(decoder, repo, DEFAULT_DICT_BATCH_SIZE)
}
fn build(decoder: &impl SystemDecoder, repo: &impl DictRepository, batch_size: usize) -> Self {
@ -107,20 +97,26 @@ mod tests {
const TEST_NUM_1: u64 = 12;
const TEST_NUM_2: u64 = 34;
const TEST_NUM_3: u64 = 9876;
const TEST_NUM_1_LEN: u8 = 2;
const TEST_NUM_3_LEN: u8 = 4;
const TEST_NUM_1_LEN: DecodedLength = DecodedLength::from(2);
const TEST_NUM_3_LEN: DecodedLength = DecodedLength::from(4);
fn decoded_value(n: u64) -> DecodedValue {
DecodedValue::new(n.to_string()).unwrap()
}
fn get_test_dec_map() -> HashMap<String, DecodedValue> {
HashMap::from([
(TEST_WORD_1.to_string(), TEST_NUM_1.to_string()),
(TEST_WORD_2.to_string(), TEST_NUM_2.to_string()),
(TEST_WORD_3.to_string(), TEST_NUM_3.to_string()),
(TEST_WORD_4.to_string(), TEST_NUM_3.to_string()),
(TEST_WORD_1.to_string(), decoded_value(TEST_NUM_1)),
(TEST_WORD_2.to_string(), decoded_value(TEST_NUM_2)),
(TEST_WORD_3.to_string(), decoded_value(TEST_NUM_3)),
(TEST_WORD_4.to_string(), decoded_value(TEST_NUM_3)),
])
}
fn mock_decoding(word: &str) -> DecodedValue {
get_test_dec_map().remove(word).unwrap()
fn mock_decoding(word: &str) -> Result<DecodedValue, CodecError> {
get_test_dec_map()
.remove(word)
.ok_or_else(|| CodecError::UnexpectedError("".to_string()))
}
fn get_test_words() -> Vec<String> {
@ -135,21 +131,10 @@ mod tests {
mock! {
pub Decoder {}
impl SystemDecoder for Decoder {
fn decode(&self, word: &str) -> DecodedValue;
fn decode(&self, word: &str) -> Result<DecodedValue, CodecError>;
}
}
#[test]
fn test_insert_words_empty() {
let words = vec![];
let mut decoder = MockDecoder::new();
decoder.expect_decode().returning(|_| DecodedValue::new());
let mut lv_map = LenValueMap::new();
lv_map.insert_words(words, &decoder).unwrap();
assert!(lv_map.is_empty());
}
#[test]
fn test_single_word() {
let words = vec![TEST_WORD_1.to_string()];

24
lib/src/core/sys_major/rules_pl.rs

@ -139,84 +139,84 @@ mod tests {
#[test]
fn test_major_dict_pl_decode_0_1() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("SZSCZ");
let output = decoder.decode("SZSCZ").unwrap();
assert_eq!(output, "0")
}
#[test]
fn test_major_dict_pl_decode_0_2() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("SZSICZ");
let output = decoder.decode("SZSICZ").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_major_dict_pl_decode_0_3() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("SZCZRZZCZDZSZ");
let output = decoder.decode("SZCZRZZCZDZSZ").unwrap();
assert_eq!(output, "0")
}
#[test]
fn test_major_dict_pl_decode_0_4() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("SZCZRZZICZDZSZ");
let output = decoder.decode("SZCZRZZICZDZSZ").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_major_dict_pl_decode_1_1() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("SZTCZ");
let output = decoder.decode("SZTCZ").unwrap();
assert_eq!(output, "1")
}
#[test]
fn test_major_dict_pl_decode_1_2() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("DZDŻDŹDDZDŻDŹ");
let output = decoder.decode("DZDŻDŹDDZDŻDŹ").unwrap();
assert_eq!(output, "1")
}
#[test]
fn test_major_dict_pl_decode_1_3() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("DZDŻDŹDZDZDŻDŹ");
let output = decoder.decode("DZDŻDŹDZDZDŻDŹ").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_major_dict_pl_decode_2_1() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("NINNI");
let output = decoder.decode("NINNI").unwrap();
assert_eq!(output, "2")
}
#[test]
fn test_major_dict_pl_decode_2_2() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("NININI");
let output = decoder.decode("NININI").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_major_dict_pl_decode_4_1() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("RZRRZ");
let output = decoder.decode("RZRRZ").unwrap();
assert_eq!(output, "4")
}
#[test]
fn test_major_dict_pl_decode_4_2() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("RZRZRZ");
let output = decoder.decode("RZRZRZ").unwrap();
assert_eq!(output, "")
}
#[test]
fn test_major_dict_pl_decode_full_1() {
let decoder = Decoder::new(get_rules());
let output = decoder.decode("ATADANAMARALAJAKAGAFAWAPABA");
let output = decoder.decode("ATADANAMARALAJAKAGAFAWAPABA").unwrap();
assert_eq!(output, "1123456778899")
}
}

18
lib/src/core/traits.rs

@ -1,22 +1,16 @@
use crate::core::entities::EncodingResult;
use crate::core::errors::EncoderError;
use crate::core::entities::EncodedValue;
use crate::core::errors::CodecError;
use super::entities::{Dict, DictEntry};
use super::entities::{DecodedValue, Dict, DictEntry};
use super::errors::RepositoryError;
/// The number value can be encoded as many word sets,
/// but decoded as one number. For partial values, we can use
/// u64, but for the whole decoded value that may be very long,
/// we need a string.
pub type DecodedValue = String;
pub trait SystemDecoder {
fn decode(&self, word: &str) -> DecodedValue;
fn decode(&self, word: &str) -> Result<DecodedValue, CodecError>;
}
pub trait SystemEncoder {
fn initialize(&self) -> Result<(), EncoderError>;
fn encode(&self, word: &str) -> Result<EncodingResult, EncoderError>;
fn initialize(&self) -> Result<(), CodecError>;
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError>;
}
#[async_trait::async_trait]

4
lib/src/presentation/cli/commands/decode.rs

@ -5,6 +5,6 @@ use tracing::debug;
pub async fn run(config: DecoderConfig) {
debug!("Running greeter with config {:?}", config);
let decoder = system::create_decoder(&config.system);
let result = decoder.decode(&config.input);
println!("{}", result);
let result = decoder.decode(&config.input).unwrap();
println!("{}", result.as_str());
}

Loading…
Cancel
Save