|
|
|
@ -16,6 +16,109 @@ impl SystemEncoder for Encoder { |
|
|
|
Ok(()) |
|
|
|
Ok(()) |
|
|
|
} |
|
|
|
} |
|
|
|
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError> { |
|
|
|
fn encode(&self, word: &str) -> Result<EncodedValue, CodecError> { |
|
|
|
todo!() |
|
|
|
let size = word.chars().count(); |
|
|
|
|
|
|
|
let max_mask: usize = (1 << (size - 1)) - 1; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let indices: Vec<usize> = word.char_indices().map(|(i, _)| i).collect(); |
|
|
|
|
|
|
|
let mut results = Vec::with_capacity(max_mask); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for mask in 0..=max_mask { |
|
|
|
|
|
|
|
let mut parts = Vec::new(); |
|
|
|
|
|
|
|
let mut last_split = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Iterate through the mask bits to find where to split
|
|
|
|
|
|
|
|
for i in 0..size - 1 { |
|
|
|
|
|
|
|
// Check if the i-th bit is set
|
|
|
|
|
|
|
|
if (mask >> i) & 1 == 1 { |
|
|
|
|
|
|
|
// The split corresponds to the byte index of the (i+1)-th character
|
|
|
|
|
|
|
|
let split_idx = indices[i + 1]; |
|
|
|
|
|
|
|
parts.push(&word[last_split..split_idx]); |
|
|
|
|
|
|
|
last_split = split_idx; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
// Push the remaining part of the string
|
|
|
|
|
|
|
|
parts.push(&word[last_split..]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Calculate metrics for sorting
|
|
|
|
|
|
|
|
let num_parts = parts.len(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// To find the "most equal" size, we minimize the sum of squared lengths.
|
|
|
|
|
|
|
|
// (This mathematically minimizes variance without needing floating point math)
|
|
|
|
|
|
|
|
let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Construct the final string representation (e.g., "abc|de|fg")
|
|
|
|
|
|
|
|
let result_string = parts.join("|"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results.push(Partition { |
|
|
|
|
|
|
|
word: result_string, |
|
|
|
|
|
|
|
num_parts, |
|
|
|
|
|
|
|
sum_sq_len, |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Ok(EncodedValue::new(words))
|
|
|
|
|
|
|
|
// Sort by:
|
|
|
|
|
|
|
|
// 1. Fewer parts first (1 part, then 2 parts...)
|
|
|
|
|
|
|
|
// 2. Most equal lengths (lower sum of squared lengths is more balanced)
|
|
|
|
|
|
|
|
// 3. Lexicographically (for deterministic stability)
|
|
|
|
|
|
|
|
results.sort_by(|a, b| { |
|
|
|
|
|
|
|
a.num_parts |
|
|
|
|
|
|
|
.cmp(&b.num_parts) |
|
|
|
|
|
|
|
.then(a.sum_sq_len.cmp(&b.sum_sq_len)) |
|
|
|
|
|
|
|
.then(a.word.cmp(&b.word)) |
|
|
|
|
|
|
|
}); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Extract just the strings
|
|
|
|
|
|
|
|
let words = results.into_iter().map(|p| p.word).collect(); |
|
|
|
|
|
|
|
Ok(EncodedValue::new(words)) |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// One candidate split of a word, bundled with the metrics used to rank it.
///
/// Keeping the rendered string and its sort keys together lets the candidates be sorted
/// without recomputing the metrics during comparison.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Partition {
    /// The pieces of the split joined with `|`, e.g. `"abc|de|fg"`.
    word: String,
    /// Number of pieces the word was split into.
    num_parts: usize,
    /// Sum of the squared character counts of the pieces; lower means more evenly sized.
    sum_sq_len: usize,
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): the `push` arguments look like (length, value, label) — confirm against
    // `LenValueMap`. The `encode` visible next to these tests returns `|`-joined splits and
    // never reads the map, so these expectations may not hold yet — verify.

    /// Several labels stored under one value: encoding that value is expected to return
    /// exactly those labels.
    #[test]
    fn test_encode_as_single_length_result() {
        let mut map = LenValueMap::new();
        map.push(3, 123, "test_123");
        map.push(3, 345, "test_345_1");
        map.push(3, 345, "test_345_2");
        map.push(3, 678, "test_678");

        let encoded = Encoder::new(map).encode("345").unwrap();

        assert_eq!(encoded.len(), 2);
        for expected in &["test_345_1", "test_345_2"] {
            assert!(encoded.contains(&(*expected).into()));
        }
    }

    /// Entries of length one, two and three: encoding "987" is expected to return every
    /// label whose value occurs inside the input.
    #[test]
    fn test_encode_as_all_lengths() {
        let mut map = LenValueMap::new();
        map.push(1, 0, "test_0");
        map.push(1, 9, "test_9");
        map.push(1, 8, "test_8");
        map.push(2, 98, "test_98");
        map.push(2, 87, "test_87");
        map.push(3, 987, "test_987");
        map.push(3, 876, "test_876");

        let encoded = Encoder::new(map).encode("987").unwrap();

        assert_eq!(encoded.len(), 5);
        for expected in &["test_987", "test_98", "test_87", "test_9", "test_8"] {
            assert!(encoded.contains(&(*expected).into()));
        }
    }
}
|
|
|
|