// phomnemic/lib/src/sys_major/encoder.rs

use crate::common::{
    entities::{EncodedPart, EncodedSplit, EncodedValue},
    errors::CodecError,
    traits::*,
};

use super::lvmap::LenValueMap;

/// Encoder that turns a digit string into word candidates by looking up
/// groups of digits in a [`LenValueMap`].
#[derive(Debug)]
pub struct Encoder {
    /// Lookup table queried during encoding with a part's length and its
    /// numeric value.
    lv_map: LenValueMap,
}

impl Encoder {
    pub fn new(lv_map: LenValueMap) -> Self {
        Encoder { lv_map }
    }
}

impl SystemEncoder for Encoder {
    fn initialize(&self) -> Result<(), CodecError> {
        Ok(())
    }

    fn encode(&self, input: &str) -> Result<EncodedValue, CodecError> {
        let size = input.chars().count();
        let max_mask: usize = (1 << (size - 1)) - 1;

        let indices: Vec<usize> = input.char_indices().map(|(i, _)| i).collect();
        let mut results = Vec::with_capacity(max_mask);

        for mask in 0..=max_mask {
            let mut parts: Vec<String> = Vec::new();
            let mut last_split = input.char_indices().count(); // we go from right to left to start with the longest parts

            // Iterate through the mask bits to find where to split
            for i in 0..size - 1 {
                // Check if the i-th bit is set
                if (mask >> i) & 1 == 1 {
                    // The split corresponds to the byte index of the (i+1)-th character
                    let split_idx = indices[indices.len() - i - 1];
                    parts.push(input[split_idx..last_split].to_string());
                    last_split = split_idx;
                }
            }
            // Push the remaining part of the string
            parts.push(input[..last_split].to_string());

            let mut all_matched = true;
            let mut split = EncodedSplit::new();
            parts.reverse();

            for part in &parts {
                let Ok(num_part) = part.parse::<u64>() else {
                    all_matched = false;
                    break;
                };
                let Some(words) = self.lv_map.get(part.len() as u8, num_part) else {
                    all_matched = false;
                    break;
                };
                split.push(EncodedPart {
                    value: num_part,
                    words: words.clone(),
                });
            }

            if all_matched {
                results.push(Partition {
                    value: split,
                    // To find the "most equal" size, we minimize the sum of squared lengths.
                    // (This mathematically minimizes variance without needing floating point math)
                    sum_sq_len: parts.iter().map(|p| p.chars().count().pow(2)).sum(),
                });
            }

            // Calculate metrics for sorting
            // let num_parts = parts.len();

            // // To find the "most equal" size, we minimize the sum of squared lengths.
            // // (This mathematically minimizes variance without needing floating point math)
            // let sum_sq_len: usize = parts.iter().map(|p| p.chars().count().pow(2)).sum();

            // if let Some(words) = self.lv_map.get(size as u8, input.parse().unwrap()) {
            //     results.push(Partition {
            //         parts: words.clone(),
            //         sum_sq_len,
            //     });
            // }
        }

        // Ok(EncodedValue::new(words))
        // Sort by:
        // 1. Fewer parts first (1 part, then 2 parts...)
        // 2. Most equal lengths (lower sum of squared lengths is more balanced)
        // 3. TODO: Lexicographically (for deterministic stability)?
        results.sort_by(|a, b| {
            a.value
                .len()
                .cmp(&b.value.len())
                .then(a.sum_sq_len.cmp(&b.sum_sq_len))
        });

        // Extract just the strings
        let split_results = results.into_iter().map(|p| p.value).collect();
        Ok(EncodedValue::new(split_results))
    }
}

/// A helper struct to keep the split variant and its sort metrics together.
struct Partition {
    /// The candidate split; its part count (`len()`) is the primary sort key.
    value: EncodedSplit,
    /// Sum of the squared part lengths; the secondary sort key, where a lower
    /// value means more evenly sized parts.
    sum_sq_len: usize,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A value that only exists as a three-digit entry yields exactly one
    /// split with one part, carrying every word registered for that entry in
    /// insertion order.
    #[test]
    fn test_encode_as_single_length_result() {
        let mut lvmap = LenValueMap::new();
        lvmap.push(3, 123, "test_123");
        lvmap.push(3, 345, "test_345_1");
        lvmap.push(3, 345, "test_345_2");
        lvmap.push(3, 678, "test_678");

        let encoded = Encoder::new(lvmap).encode("345").unwrap();
        assert_eq!(encoded.len(), 1); // single split

        let only_split = &encoded[0];
        assert_eq!(only_split.len(), 1); // single part

        let only_part = &only_split[0];
        assert_eq!(only_part.value, 345);
        assert_eq!(only_part.words.len(), 2); // two words
        assert_eq!(only_part.words[0], "test_345_1");
        assert_eq!(only_part.words[1], "test_345_2");
    }

    /// When every sub-part is known, all four splits of a three-digit input
    /// come back, ordered by part count and then by enumeration order.
    #[test]
    fn test_encode_as_all_lengths() {
        let mut lvmap = LenValueMap::new();
        lvmap.push(1, 0, "test_0");
        lvmap.push(1, 9, "test_9");
        lvmap.push(1, 8, "test_8");
        lvmap.push(1, 7, "test_7");
        lvmap.push(2, 98, "test_98");
        lvmap.push(2, 87, "test_87");
        lvmap.push(3, 987, "test_987");
        lvmap.push(3, 876, "test_876");

        let encoded = Encoder::new(lvmap).encode("987").unwrap();

        // One row per expected split; each entry is the single word expected
        // for the part at that position.
        let expected: [&[&str]; 4] = [
            &["test_987"],                    // 987
            &["test_98", "test_7"],           // 98|7
            &["test_9", "test_87"],           // 9|87
            &["test_9", "test_8", "test_7"],  // 9|8|7
        ];

        assert_eq!(encoded.len(), expected.len());
        for (split_idx, expected_words) in expected.iter().enumerate() {
            assert_eq!(encoded[split_idx].len(), expected_words.len());
            for (part_idx, expected_word) in expected_words.iter().enumerate() {
                let part = &encoded[split_idx][part_idx];
                assert_eq!(part.words.len(), 1);
                assert_eq!(part.words[0], *expected_word);
            }
        }
    }
}