From dd1b3ddc998cd722d8e15c5475f106de54784db3 Mon Sep 17 00:00:00 2001 From: chodak166 Date: Fri, 26 Dec 2025 18:40:30 +0100 Subject: [PATCH] WIP: encoder --- lib/src/core/entities.rs | 15 +++++ lib/src/core/sys_major/encoder.rs | 105 +++++++++++++++++++++++++++++- lib/src/core/sys_major/lvmap.rs | 16 ++++- 3 files changed, 134 insertions(+), 2 deletions(-) diff --git a/lib/src/core/entities.rs b/lib/src/core/entities.rs index 141284b..68e38bd 100644 --- a/lib/src/core/entities.rs +++ b/lib/src/core/entities.rs @@ -1,11 +1,26 @@ use super::errors::CodecError; use std::num::ParseIntError; +use std::ops::Deref; use std::{collections::HashMap, u64}; /// A number encoded as a sequence of words #[derive(Debug, Clone)] pub struct EncodedValue(Vec); +impl EncodedValue { + pub fn new(data: Vec) -> Self { + EncodedValue(data) + } +} + +impl Deref for EncodedValue { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// The number value can be encoded as many word sets, /// but decoded as one number. For partial values /// and dictionary words (reasonable length), we can use diff --git a/lib/src/core/sys_major/encoder.rs b/lib/src/core/sys_major/encoder.rs index fa1c0ef..fbe18f5 100644 --- a/lib/src/core/sys_major/encoder.rs +++ b/lib/src/core/sys_major/encoder.rs @@ -16,6 +16,109 @@ impl SystemEncoder for Encoder { Ok(()) } fn encode(&self, word: &str) -> Result { - todo!() + let size = word.chars().count(); + let max_mask: usize = (1 << (size - 1)) - 1; + + let indices: Vec = word.char_indices().map(|(i, _)| i).collect(); + let mut results = Vec::with_capacity(max_mask); + + for mask in 0..=max_mask { + let mut parts = Vec::new(); + let mut last_split = 0; + + // Iterate through the mask bits to find where to split + for i in 0..size - 1 { + // Check if the i-th bit is set + if (mask >> i) & 1 == 1 { + // The split corresponds to the byte index of the (i+1)-th character + let split_idx = indices[i + 1]; + parts.push(&word[last_split..split_idx]); + last_split = split_idx; + } + } + // Push the remaining part of the string + parts.push(&word[last_split..]); + + // Calculate metrics for sorting + let num_parts = parts.len(); + + // To find the "most equal" size, we minimize the sum of squared lengths. + // (This mathematically minimizes variance without needing floating point math) + let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); + + // Construct the final string representation (e.g., "abc|de|fg") + let result_string = parts.join("|"); + + results.push(Partition { + word: result_string, + num_parts, + sum_sq_len, + }); + } + + // Ok(EncodedValue::new(words)) + // Sort by: + // 1. Fewer parts first (1 part, then 2 parts...) + // 2. Most equal lengths (lower sum of squared lengths is more balanced) + // 3. Lexicographically (for deterministic stability) + results.sort_by(|a, b| { + a.num_parts + .cmp(&b.num_parts) + .then(a.sum_sq_len.cmp(&b.sum_sq_len)) + .then(a.word.cmp(&b.word)) + }); + + // Extract just the strings + let words = results.into_iter().map(|p| p.word).collect(); + Ok(EncodedValue::new(words)) + } +} + +// A helper struct to keep the string and its sort metrics together +struct Partition { + word: String, + num_parts: usize, + sum_sq_len: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_as_single_length_result() { + let mut lvmap = LenValueMap::new(); + lvmap.push(3, 123, "test_123"); + lvmap.push(3, 345, "test_345_1"); + lvmap.push(3, 345, "test_345_2"); + lvmap.push(3, 678, "test_678"); + let encoder = Encoder::new(lvmap); + let result = encoder.encode("345").unwrap(); + + assert_eq!(result.len(), 2); + assert!(result.contains(&"test_345_1".into())); + assert!(result.contains(&"test_345_2".into())); + } + + #[test] + fn test_encode_as_all_lengths() { + let mut lvmap = LenValueMap::new(); + lvmap.push(1, 0, "test_0"); + lvmap.push(1, 9, "test_9"); + lvmap.push(1, 8, "test_8"); + lvmap.push(2, 98, "test_98"); + lvmap.push(2, 87, "test_87"); + lvmap.push(3, 987, "test_987"); + lvmap.push(3, 876, "test_876"); + + let encoder = Encoder::new(lvmap); + let result = encoder.encode("987").unwrap(); + + assert_eq!(result.len(), 5); + assert!(result.contains(&"test_987".into())); + assert!(result.contains(&"test_98".into())); + assert!(result.contains(&"test_87".into())); + assert!(result.contains(&"test_9".into())); + assert!(result.contains(&"test_8".into())); } } diff --git a/lib/src/core/sys_major/lvmap.rs b/lib/src/core/sys_major/lvmap.rs index 2ec83fc..e9ed4c1 100644 --- a/lib/src/core/sys_major/lvmap.rs +++ b/lib/src/core/sys_major/lvmap.rs @@ -1,5 +1,5 @@ use crate::core::{DictRepository, SystemDecoder, entities::DecodedLength, errors::CodecError}; -use std::{collections::HashMap, num::ParseIntError}; +use std::{collections::HashMap, hash::Hash, num::ParseIntError}; use thiserror::Error; // We store words by encoded number length, then encoded value @@ -51,6 +51,16 @@ impl LenValueMap { self.data } + pub fn push(&mut self, len: u8, num: DecodedNumber, word: impl Into) -> &mut Self { + self.data + .entry(DecodedLength::from(len)) + .or_insert_with(HashMap::new) + .entry(num) + .or_insert_with(Vec::new) + .push(word.into()); + self + } + pub fn insert_words( &mut self, words: I, @@ -75,6 +85,10 @@ impl LenValueMap { Ok(()) } + pub fn from_data(data: LenValueData) -> Self { + Self { data: data } + } + pub async fn from_dict( decoder: &impl SystemDecoder, repo: &impl DictRepository,