115 lines
3.6 KiB
Rust
115 lines
3.6 KiB
Rust
use std::collections::BTreeSet;
|
|
use std::collections::HashMap;
|
|
use std::env;
|
|
use std::fs;
|
|
use std::path::{Path, PathBuf};
|
|
|
|
/// Relative path of the directory whose `.txt` files supply the premium word list.
const PREMIUM_DIR: &str = "src/dictionaries/premium";
|
|
/// Relative path of the directory whose `.txt` files supply the trademark word list.
const TRADEMARKS_DIR: &str = "src/dictionaries/trademarks";
|
|
|
|
/// Canonicalizes a raw dictionary entry.
///
/// Surrounding whitespace is removed and ASCII letters are lowercased.
/// Returns `None` when the trimmed entry is empty, is longer than 20 bytes,
/// or contains anything other than ASCII letters and digits.
fn normalize_word(word: &str) -> Option<String> {
    const MAX_LEN: usize = 20;

    let candidate = word.trim();
    // ASCII lowercasing never changes the byte length or whether a byte is
    // alphanumeric, so both checks can run before the lowercase copy is made.
    let length_ok = (1..=MAX_LEN).contains(&candidate.len());
    let charset_ok = candidate.bytes().all(|b| b.is_ascii_alphanumeric());

    (length_ok && charset_ok).then(|| candidate.to_ascii_lowercase())
}
|
|
|
|
/// Recursively collects every file with a `txt` extension beneath `dir`.
///
/// The returned paths are sorted. A directory that cannot be listed
/// contributes nothing, and individual entries that fail to read are skipped.
fn gather_files(dir: &Path) -> Vec<PathBuf> {
    let listing = match fs::read_dir(dir) {
        Ok(listing) => listing,
        Err(_) => return Vec::new(),
    };

    let mut found: Vec<PathBuf> = Vec::new();
    for path in listing.flatten().map(|entry| entry.path()) {
        if path.is_dir() {
            // Descend into sub-directories; their results are merged and
            // re-sorted together with this level's files below.
            found.append(&mut gather_files(&path));
        } else if path.extension().map_or(false, |ext| ext == "txt") {
            found.push(path);
        }
    }

    found.sort();
    found
}
|
|
|
|
fn load_word_set(dir: &Path, label: &str) -> BTreeSet<String> {
|
|
let mut out = BTreeSet::new();
|
|
let mut seen: HashMap<String, usize> = HashMap::new();
|
|
for file in gather_files(dir) {
|
|
println!("cargo:rerun-if-changed={}", file.display());
|
|
let raw = fs::read_to_string(&file).unwrap_or_default();
|
|
for line in raw.lines() {
|
|
let line = line.trim();
|
|
if line.is_empty() || line.starts_with('#') {
|
|
continue;
|
|
}
|
|
if let Some(w) = normalize_word(line) {
|
|
*seen.entry(w.clone()).or_insert(0) += 1;
|
|
out.insert(w);
|
|
}
|
|
}
|
|
}
|
|
let mut duplicate_words = 0usize;
|
|
let mut duplicate_entries = 0usize;
|
|
let mut sample: Vec<String> = Vec::new();
|
|
let mut keys: Vec<_> = seen.keys().cloned().collect();
|
|
keys.sort();
|
|
for k in keys {
|
|
if let Some(cnt) = seen.get(&k) {
|
|
if *cnt > 1 {
|
|
duplicate_words += 1;
|
|
duplicate_entries += cnt - 1;
|
|
if sample.len() < 40 {
|
|
sample.push(format!("{k} x{cnt}"));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if duplicate_words > 0 {
|
|
println!(
|
|
"cargo:warning=[{label}] duplicates found: words={}, extra_entries={}",
|
|
duplicate_words, duplicate_entries
|
|
);
|
|
println!(
|
|
"cargo:warning=[{label}] duplicate samples: {}",
|
|
sample.join(", ")
|
|
);
|
|
}
|
|
out
|
|
}
|
|
|
|
fn main() {
|
|
let premium_dir = Path::new(PREMIUM_DIR);
|
|
let trademarks_dir = Path::new(TRADEMARKS_DIR);
|
|
println!("cargo:rerun-if-changed={}", premium_dir.display());
|
|
println!("cargo:rerun-if-changed={}", trademarks_dir.display());
|
|
|
|
let premium = load_word_set(premium_dir, "premium");
|
|
let trademarks = load_word_set(trademarks_dir, "trademarks");
|
|
|
|
let premium_words: Vec<String> = premium.into_iter().collect();
|
|
let trademark_words: Vec<String> = trademarks.into_iter().collect();
|
|
let mut out = String::new();
|
|
out.push_str("// @generated by build.rs\n");
|
|
out.push_str("pub static PREMIUM_WORDS: &[&str] = &[\n");
|
|
for w in &premium_words {
|
|
out.push_str(" \"");
|
|
out.push_str(w);
|
|
out.push_str("\",\n");
|
|
}
|
|
out.push_str("];\n");
|
|
out.push_str("pub static TRADEMARK_WORDS: &[&str] = &[\n");
|
|
for w in &trademark_words {
|
|
out.push_str(" \"");
|
|
out.push_str(w);
|
|
out.push_str("\",\n");
|
|
}
|
|
out.push_str("];\n");
|
|
|
|
let out_dir = env::var("OUT_DIR").expect("OUT_DIR is not set");
|
|
let dst = Path::new(&out_dir).join("generated_dictionary.rs");
|
|
fs::write(dst, out).expect("failed to write generated dictionary");
|
|
}
|