use crate::common::{ entities::{EncodedPart, EncodedSplit, EncodedValue}, errors::CodecError, traits::*, }; use super::lvmap::LenValueMap; #[derive(Debug)] pub struct Encoder { lv_map: LenValueMap, } impl Encoder { pub fn new(lv_map: LenValueMap) -> Self { Encoder { lv_map } } } impl SystemEncoder for Encoder { fn initialize(&self) -> Result<(), CodecError> { Ok(()) } fn encode(&self, input: &str) -> Result { let size = input.chars().count(); let max_mask: usize = (1 << (size - 1)) - 1; let indices: Vec = input.char_indices().map(|(i, _)| i).collect(); let mut results = Vec::with_capacity(max_mask); for mask in 0..=max_mask { let mut parts: Vec = Vec::new(); let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts // Iterate through the mask bits to find where to split for i in 0..size - 1 { // Check if the i-th bit is set if (mask >> i) & 1 == 1 { // The split corresponds to the byte index of the (i+1)-th character let split_idx = indices[indices.len() - i - 1]; parts.push(input[split_idx..last_split].to_string()); last_split = split_idx; } } // Push the remaining part of the string parts.push(input[..last_split].to_string()); let mut all_matched = true; let mut split = EncodedSplit::new(); parts.reverse(); for part in &parts { let Ok(num_part) = part.parse::() else { all_matched = false; break; }; let Some(words) = self.lv_map.get(part.len() as u8, num_part) else { all_matched = false; break; }; split.push(EncodedPart { value: num_part, words: words.clone(), }); } if all_matched { results.push(Partition { value: split, // To find the "most equal" size, we minimize the sum of squared lengths. // (This mathematically minimizes variance without needing floating point math) sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(), }); } // Calculate metrics for sorting // let num_parts = parts.len(); // // To find the "most equal" size, we minimize the sum of squared lengths. // // (This mathematically minimizes variance without needing floating point math) // let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); // if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) { // results.push(Partition { // parts: words.clone(), // sum_sq_len, // }); // } } // Ok(EncodedValue::new(words)) // Sort by: // 1. Fewer parts first (1 part, then 2 parts...) // 2. Most equal lengths (lower sum of squared lengths is more balanced) // 3. TODO: Lexicographically (for deterministic stability)? results.sort_by(|a, b| { a.value .len() .cmp(&b.value.len()) .then(a.sum_sq_len.cmp(&b.sum_sq_len)) }); // Extract just the strings let split_results = results.into_iter().map(|p| p.value).collect(); Ok(EncodedValue::new(split_results)) } } // A helper struct to keep the split variant and its sort metrics together struct Partition { value: EncodedSplit, sum_sq_len: usize, } #[cfg(test)] mod tests { use super::*; #[test] fn test_encode_as_single_length_result() { let mut lvmap = LenValueMap::new(); lvmap.push(3, 123, "test_123"); lvmap.push(3, 345, "test_345_1"); lvmap.push(3, 345, "test_345_2"); lvmap.push(3, 678, "test_678"); let encoder = Encoder::new(lvmap); let result = encoder.encode("345").unwrap(); assert_eq!(result.len(), 1); // single split assert_eq!(result[0].len(), 1); // single part assert_eq!(result[0][0].value, 345); assert_eq!(result[0][0].words.len(), 2); // two words assert_eq!(result[0][0].words[0], "test_345_1"); assert_eq!(result[0][0].words[1], "test_345_2"); } #[test] fn test_encode_as_all_lengths() { let mut lvmap = LenValueMap::new(); lvmap.push(1, 0, "test_0"); lvmap.push(1, 9, "test_9"); lvmap.push(1, 8, "test_8"); lvmap.push(1, 7, "test_7"); lvmap.push(2, 98, "test_98"); lvmap.push(2, 87, "test_87"); lvmap.push(3, 987, "test_987"); lvmap.push(3, 876, "test_876"); let encoder = Encoder::new(lvmap); let result = encoder.encode("987").unwrap(); assert_eq!(result.len(), 4); // 987, 98|7, 9|87, 9|8|7 assert_eq!(result[0].len(), 1); // 987 assert_eq!(result[0][0].words.len(), 1); assert_eq!(result[0][0].words[0], "test_987"); assert_eq!(result[1].len(), 2); // 98|7 assert_eq!(result[1][0].words.len(), 1); assert_eq!(result[1][0].words[0], "test_98"); assert_eq!(result[1][1].words.len(), 1); assert_eq!(result[1][1].words[0], "test_7"); assert_eq!(result[2].len(), 2); // 9|87 assert_eq!(result[2][0].words.len(), 1); assert_eq!(result[2][0].words[0], "test_9"); assert_eq!(result[2][1].words.len(), 1); assert_eq!(result[2][1].words[0], "test_87"); assert_eq!(result[3].len(), 3); // 9|8|7 assert_eq!(result[3][0].words.len(), 1); assert_eq!(result[3][0].words[0], "test_9"); assert_eq!(result[3][1].words.len(), 1); assert_eq!(result[3][1].words[0], "test_8"); assert_eq!(result[3][2].words.len(), 1); assert_eq!(result[3][2].words[0], "test_7"); } }