Browse Source

WIP: encoder

develop-refactor
chodak166 4 months ago
parent
commit
dd1b3ddc99
  1. 15
      lib/src/core/entities.rs
  2. 105
      lib/src/core/sys_major/encoder.rs
  3. 16
      lib/src/core/sys_major/lvmap.rs

15
lib/src/core/entities.rs

@ -1,11 +1,26 @@
use super::errors::CodecError;
use std::num::ParseIntError;
use std::ops::Deref;
use std::{collections::HashMap, u64};
/// An encoded number, held as an ordered list of words.
///
/// The wrapper dereferences to the underlying `Vec<String>`, so read-only
/// vector and slice methods (`len`, `contains`, indexing, iteration) can be
/// called on it directly.
#[derive(Debug, Clone)]
pub struct EncodedValue(Vec<String>);

impl EncodedValue {
    /// Wraps an already-built list of words without copying it.
    pub fn new(data: Vec<String>) -> Self {
        Self(data)
    }
}

impl Deref for EncodedValue {
    type Target = Vec<String>;

    /// Borrows the wrapped word list.
    fn deref(&self) -> &Self::Target {
        let Self(words) = self;
        words
    }
}
/// The number value can be encoded as many word sets,
/// but decoded as one number. For partial values
/// and dictionary words (reasonable length), we can use

105
lib/src/core/sys_major/encoder.rs

@ -16,6 +16,109 @@ impl SystemEncoder for Encoder {
Ok(())
}
/// Enumerates every way of cutting `word` into contiguous, non-empty parts.
///
/// Each partition is rendered as its parts joined with `|` (for example
/// `"98|7"`), and the partitions are returned ordered by:
///
/// 1. fewer parts first,
/// 2. more evenly sized parts first (smaller sum of squared part lengths),
/// 3. lexicographic order of the rendered string, as a deterministic tie-break.
///
/// A word of `n` characters yields `2^(n - 1)` partitions, so the output grows
/// exponentially with the input length. An empty `word` yields an empty result.
///
/// NOTE(review): `self` is not consulted yet; the unit tests in this file expect
/// dictionary words in the result, so presumably the partitions are meant to be
/// mapped through the encoder's dictionary in a later step. Confirm before
/// relying on the output format.
///
/// # Errors
///
/// The current implementation never returns an error.
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError> {
    if word.is_empty() {
        // Nothing to split. Returning early also avoids computing `0 - 1` on an
        // unsigned length, which would panic in debug builds.
        return Ok(EncodedValue::new(Vec::new()));
    }

    // Byte offsets at which the word may be cut: the start of every character
    // except the first. Using byte offsets from `char_indices` keeps the slicing
    // below on valid UTF-8 boundaries for multi-byte input.
    let cut_points: Vec<usize> = word.char_indices().skip(1).map(|(idx, _)| idx).collect();

    // Bit `i` of a mask decides whether the word is cut at `cut_points[i]`.
    // NOTE(review): this shift overflows for words longer than `usize::BITS`
    // characters; such inputs are infeasible to enumerate anyway and should
    // probably be rejected with a dedicated `CodecError`.
    let partition_count = 1usize << cut_points.len();

    let mut results = Vec::with_capacity(partition_count);
    for mask in 0..partition_count {
        let mut parts = Vec::new();
        let mut last_split = 0;
        for (bit, &split_idx) in cut_points.iter().enumerate() {
            if (mask >> bit) & 1 == 1 {
                parts.push(&word[last_split..split_idx]);
                last_split = split_idx;
            }
        }
        // Whatever follows the last cut (or the whole word when there is no cut).
        parts.push(&word[last_split..]);

        // For a fixed number of parts and a fixed total length, a smaller sum of
        // squared lengths means a smaller variance, i.e. more balanced parts.
        // This avoids floating-point math.
        let sum_sq_len: usize = parts.iter().map(|part| part.chars().count().pow(2)).sum();

        results.push(Partition {
            word: parts.join("|"),
            num_parts: parts.len(),
            sum_sq_len,
        });
    }

    // Entries that compare equal on all three keys are indistinguishable, so an
    // unstable sort produces the same output as a stable one.
    results.sort_unstable_by(|a, b| {
        a.num_parts
            .cmp(&b.num_parts)
            .then(a.sum_sq_len.cmp(&b.sum_sq_len))
            .then_with(|| a.word.cmp(&b.word))
    });

    Ok(EncodedValue::new(
        results.into_iter().map(|partition| partition.word).collect(),
    ))
}
}
/// One way of splitting a word, bundled with the metrics used to rank it.
///
/// Keeping the rendered string and its sort keys together lets the encoder sort
/// all candidates without recomputing the metrics during comparison.
struct Partition {
/// The partition rendered as its parts joined with `|`.
word: String,
/// How many parts the word was split into.
num_parts: usize,
/// Sum of the squared character counts of the parts; lower means the parts
/// are closer to equal length.
sum_sq_len: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encoding a 3-digit number with only 3-digit dictionary entries should
    /// return every word registered for exactly that value.
    #[test]
    fn test_encode_as_single_length_result() {
        let mut dictionary = LenValueMap::new();
        dictionary
            .push(3, 123, "test_123")
            .push(3, 345, "test_345_1")
            .push(3, 345, "test_345_2")
            .push(3, 678, "test_678");

        let encoded = Encoder::new(dictionary).encode("345").unwrap();

        assert_eq!(encoded.len(), 2);
        for expected in ["test_345_1", "test_345_2"] {
            assert!(encoded.contains(&expected.to_string()));
        }
    }

    /// Encoding should also surface words registered for the shorter
    /// sub-sequences of the input, not only for the full value.
    #[test]
    fn test_encode_as_all_lengths() {
        let mut dictionary = LenValueMap::new();
        dictionary
            .push(1, 0, "test_0")
            .push(1, 9, "test_9")
            .push(1, 8, "test_8")
            .push(2, 98, "test_98")
            .push(2, 87, "test_87")
            .push(3, 987, "test_987")
            .push(3, 876, "test_876");

        let encoded = Encoder::new(dictionary).encode("987").unwrap();

        assert_eq!(encoded.len(), 5);
        for expected in ["test_987", "test_98", "test_87", "test_9", "test_8"] {
            assert!(encoded.contains(&expected.to_string()));
        }
    }
}

16
lib/src/core/sys_major/lvmap.rs

@ -1,5 +1,5 @@
use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError};
use std::{collections::HashMap, num::ParseIntError};
use std::{collections::HashMap, hash::Hash, num::ParseIntError};
use thiserror::Error;
// We store words by encoded number length, then encoded value
@ -51,6 +51,16 @@ impl LenValueMap {
self.data
}
/// Registers `word` under the given encoded length and encoded value.
///
/// The per-length map and the per-value word list are created on first use.
/// Returns `self` so that several insertions can be chained.
pub fn push(&mut self, len: u8, num: DecodedNumber, word: impl Into<String>) -> &mut Self {
    // First level: words grouped by the length of the encoded number.
    let words_by_value = self.data.entry(DecodedLength::from(len)).or_default();
    // Second level: all words that decode to this exact value.
    let bucket = words_by_value.entry(num).or_default();
    bucket.push(word.into());
    self
}
pub fn insert_words<I>(
&mut self,
words: I,
@ -75,6 +85,10 @@ impl LenValueMap {
Ok(())
}
/// Builds a map directly from pre-assembled data, taking ownership of it.
pub fn from_data(data: LenValueData) -> Self {
    Self { data }
}
pub async fn from_dict(
decoder: &impl SystemDecoder,
repo: &impl DictRepository,

Loading…
Cancel
Save