You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
179 lines
6.2 KiB
179 lines
6.2 KiB
use crate::common::{ |
|
entities::{EncodedPart, EncodedSplit, EncodedValue}, |
|
errors::CodecError, |
|
traits::*, |
|
}; |
|
|
|
use super::lvmap::LenValueMap; |
|
|
|
#[derive(Debug)] |
|
pub struct Encoder { |
|
lv_map: LenValueMap, |
|
} |
|
|
|
impl Encoder { |
|
pub fn new(lv_map: LenValueMap) -> Self { |
|
Encoder { lv_map } |
|
} |
|
} |
|
|
|
impl SystemEncoder for Encoder { |
|
fn initialize(&self) -> Result<(), CodecError> { |
|
Ok(()) |
|
} |
|
|
|
fn encode(&self, input: &str) -> Result<EncodedValue, CodecError> { |
|
let size = input.chars().count(); |
|
let max_mask: usize = (1 << (size - 1)) - 1; |
|
|
|
let indices: Vec<usize> = input.char_indices().map(|(i, _)| i).collect(); |
|
let mut results = Vec::with_capacity(max_mask); |
|
|
|
for mask in 0..=max_mask { |
|
let mut parts: Vec<String> = Vec::new(); |
|
let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts |
|
|
|
// Iterate through the mask bits to find where to split |
|
for i in 0..size - 1 { |
|
// Check if the i-th bit is set |
|
if (mask >> i) & 1 == 1 { |
|
// The split corresponds to the byte index of the (i+1)-th character |
|
let split_idx = indices[indices.len() - i - 1]; |
|
parts.push(input[split_idx..last_split].to_string()); |
|
last_split = split_idx; |
|
} |
|
} |
|
// Push the remaining part of the string |
|
parts.push(input[..last_split].to_string()); |
|
|
|
let mut all_matched = true; |
|
let mut split = EncodedSplit::new(); |
|
parts.reverse(); |
|
|
|
for part in &parts { |
|
let Ok(num_part) = part.parse::<u64>() else { |
|
all_matched = false; |
|
break; |
|
}; |
|
let Some(words) = self.lv_map.get(part.len() as u8, num_part) else { |
|
all_matched = false; |
|
break; |
|
}; |
|
split.push(EncodedPart { |
|
value: num_part, |
|
words: words.clone(), |
|
}); |
|
} |
|
|
|
if all_matched { |
|
results.push(Partition { |
|
value: split, |
|
// To find the "most equal" size, we minimize the sum of squared lengths. |
|
// (This mathematically minimizes variance without needing floating point math) |
|
sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(), |
|
}); |
|
} |
|
|
|
// Calculate metrics for sorting |
|
// let num_parts = parts.len(); |
|
|
|
// // To find the "most equal" size, we minimize the sum of squared lengths. |
|
// // (This mathematically minimizes variance without needing floating point math) |
|
// let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); |
|
|
|
// if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) { |
|
// results.push(Partition { |
|
// parts: words.clone(), |
|
// sum_sq_len, |
|
// }); |
|
// } |
|
} |
|
|
|
// Ok(EncodedValue::new(words)) |
|
// Sort by: |
|
// 1. Fewer parts first (1 part, then 2 parts...) |
|
// 2. Most equal lengths (lower sum of squared lengths is more balanced) |
|
// 3. TODO: Lexicographically (for deterministic stability)? |
|
results.sort_by(|a, b| { |
|
a.value |
|
.len() |
|
.cmp(&b.value.len()) |
|
.then(a.sum_sq_len.cmp(&b.sum_sq_len)) |
|
}); |
|
|
|
// Extract just the strings |
|
let split_results = results.into_iter().map(|p| p.value).collect(); |
|
Ok(EncodedValue::new(split_results)) |
|
} |
|
} |
|
|
|
// A helper struct to keep the split variant and its sort metrics together |
|
struct Partition { |
|
value: EncodedSplit, |
|
sum_sq_len: usize, |
|
} |
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A value mapped under a single length yields exactly one split with one
    /// part that carries every word registered for that value.
    #[test]
    fn test_encode_as_single_length_result() {
        let mut lvmap = LenValueMap::new();
        for (len, value, word) in [
            (3, 123, "test_123"),
            (3, 345, "test_345_1"),
            (3, 345, "test_345_2"),
            (3, 678, "test_678"),
        ] {
            lvmap.push(len, value, word);
        }

        let encoder = Encoder::new(lvmap);
        let result = encoder.encode("345").unwrap();

        assert_eq!(result.len(), 1); // single split
        let split = &result[0];
        assert_eq!(split.len(), 1); // single part
        let part = &split[0];
        assert_eq!(part.value, 345);
        assert_eq!(part.words.len(), 2); // two words
        assert_eq!(part.words[0], "test_345_1");
        assert_eq!(part.words[1], "test_345_2");
    }

    /// When every sub-part is mapped, all splits are returned, ordered by part
    /// count and then by the order in which equally ranked splits are found.
    #[test]
    fn test_encode_as_all_lengths() {
        let mut lvmap = LenValueMap::new();
        for (len, value, word) in [
            (1, 0, "test_0"),
            (1, 9, "test_9"),
            (1, 8, "test_8"),
            (1, 7, "test_7"),
            (2, 98, "test_98"),
            (2, 87, "test_87"),
            (3, 987, "test_987"),
            (3, 876, "test_876"),
        ] {
            lvmap.push(len, value, word);
        }

        let encoder = Encoder::new(lvmap);
        let result = encoder.encode("987").unwrap();

        // Expected splits, in order: 987, 98|7, 9|87, 9|8|7.
        let expected: [&[&str]; 4] = [
            &["test_987"],
            &["test_98", "test_7"],
            &["test_9", "test_87"],
            &["test_9", "test_8", "test_7"],
        ];

        assert_eq!(result.len(), expected.len());
        for (split_idx, words) in expected.iter().enumerate() {
            let split = &result[split_idx];
            assert_eq!(split.len(), words.len());
            for (part_idx, word) in words.iter().enumerate() {
                let part = &split[part_idx];
                assert_eq!(part.words.len(), 1);
                assert_eq!(part.words[0], *word);
            }
        }
    }
}
|
|
|