You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

179 lines
6.2 KiB

use crate::common::{
entities::{EncodedPart, EncodedSplit, EncodedValue},
errors::CodecError,
traits::*,
};
use super::lvmap::LenValueMap;
/// Encoder that resolves the parts of a digit string to words through a
/// [`LenValueMap`] lookup keyed by part length and numeric value.
#[derive(Debug)]
pub struct Encoder {
// Lookup table queried with `(part length, numeric value)` during encoding.
lv_map: LenValueMap,
}
impl Encoder {
pub fn new(lv_map: LenValueMap) -> Self {
Encoder { lv_map }
}
}
impl SystemEncoder for Encoder {
fn initialize(&self) -> Result<(), CodecError> {
Ok(())
}
fn encode(&self, input: &str) -> Result<EncodedValue, CodecError> {
let size = input.chars().count();
let max_mask: usize = (1 << (size - 1)) - 1;
let indices: Vec<usize> = input.char_indices().map(|(i, _)| i).collect();
let mut results = Vec::with_capacity(max_mask);
for mask in 0..=max_mask {
let mut parts: Vec<String> = Vec::new();
let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts
// Iterate through the mask bits to find where to split
for i in 0..size - 1 {
// Check if the i-th bit is set
if (mask >> i) & 1 == 1 {
// The split corresponds to the byte index of the (i+1)-th character
let split_idx = indices[indices.len() - i - 1];
parts.push(input[split_idx..last_split].to_string());
last_split = split_idx;
}
}
// Push the remaining part of the string
parts.push(input[..last_split].to_string());
let mut all_matched = true;
let mut split = EncodedSplit::new();
parts.reverse();
for part in &parts {
let Ok(num_part) = part.parse::<u64>() else {
all_matched = false;
break;
};
let Some(words) = self.lv_map.get(part.len() as u8, num_part) else {
all_matched = false;
break;
};
split.push(EncodedPart {
value: num_part,
words: words.clone(),
});
}
if all_matched {
results.push(Partition {
value: split,
// To find the "most equal" size, we minimize the sum of squared lengths.
// (This mathematically minimizes variance without needing floating point math)
sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(),
});
}
// Calculate metrics for sorting
// let num_parts = parts.len();
// // To find the "most equal" size, we minimize the sum of squared lengths.
// // (This mathematically minimizes variance without needing floating point math)
// let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum();
// if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) {
// results.push(Partition {
// parts: words.clone(),
// sum_sq_len,
// });
// }
}
// Ok(EncodedValue::new(words))
// Sort by:
// 1. Fewer parts first (1 part, then 2 parts...)
// 2. Most equal lengths (lower sum of squared lengths is more balanced)
// 3. TODO: Lexicographically (for deterministic stability)?
results.sort_by(|a, b| {
a.value
.len()
.cmp(&b.value.len())
.then(a.sum_sq_len.cmp(&b.sum_sq_len))
});
// Extract just the strings
let split_results = results.into_iter().map(|p| p.value).collect();
Ok(EncodedValue::new(split_results))
}
}
/// A helper struct to keep one split variant and its sort metric together.
struct Partition {
/// The split itself: its parts together with the words found for them.
value: EncodedSplit,
/// Sum of the squared part lengths (in chars); smaller means more evenly
/// sized parts. Used as the tie-breaker between splits with the same number
/// of parts.
sum_sq_len: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A value that is only known as a whole yields one split with one part
    /// carrying every word registered for it, in insertion order.
    #[test]
    fn test_encode_as_single_length_result() {
        let mut lvmap = LenValueMap::new();
        for (len, value, word) in [
            (3, 123, "test_123"),
            (3, 345, "test_345_1"),
            (3, 345, "test_345_2"),
            (3, 678, "test_678"),
        ] {
            lvmap.push(len, value, word);
        }
        let encoder = Encoder::new(lvmap);

        let result = encoder.encode("345").unwrap();

        assert_eq!(result.len(), 1); // single split
        let split = &result[0];
        assert_eq!(split.len(), 1); // single part
        let part = &split[0];
        assert_eq!(part.value, 345);
        assert_eq!(part.words.len(), 2); // two words
        assert_eq!(part.words[0], "test_345_1");
        assert_eq!(part.words[1], "test_345_2");
    }

    /// When every sub-part is known, all splits are returned, ordered from
    /// fewest parts to most.
    #[test]
    fn test_encode_as_all_lengths() {
        let mut lvmap = LenValueMap::new();
        for (len, value, word) in [
            (1, 0, "test_0"),
            (1, 9, "test_9"),
            (1, 8, "test_8"),
            (1, 7, "test_7"),
            (2, 98, "test_98"),
            (2, 87, "test_87"),
            (3, 987, "test_987"),
            (3, 876, "test_876"),
        ] {
            lvmap.push(len, value, word);
        }
        let encoder = Encoder::new(lvmap);

        let result = encoder.encode("987").unwrap();

        // Expected splits in order: 987, 98|7, 9|87, 9|8|7 (one word per part).
        let expected: [&[&str]; 4] = [
            &["test_987"],
            &["test_98", "test_7"],
            &["test_9", "test_87"],
            &["test_9", "test_8", "test_7"],
        ];
        assert_eq!(result.len(), expected.len());
        for (split_idx, expected_words) in expected.iter().enumerate() {
            let split = &result[split_idx];
            assert_eq!(split.len(), expected_words.len());
            for (part_idx, expected_word) in expected_words.iter().enumerate() {
                let part = &split[part_idx];
                assert_eq!(part.words.len(), 1);
                assert_eq!(part.words[0], *expected_word);
            }
        }
    }
}