Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions server/bleep/src/semantic/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,24 @@ impl Default for OverlapStrategy {
}
}

/// Reports whether a chunk looks like noise rather than meaningful text.
///
/// A chunk is considered noisy when it has no non-whitespace characters at
/// all, or when strictly more than half of its non-whitespace characters are
/// numeric (per `char::is_numeric`) or ASCII punctuation.
fn is_noisy(chunk: &str) -> bool {
    // One pass over the visible characters, tallying the total and the
    // numeric/punctuation subset side by side. Whitespace is never numeric or
    // ASCII punctuation, so dropping it up front does not affect either tally.
    let (visible, noisy) = chunk
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .fold((0usize, 0usize), |(visible, noisy), ch| {
            let is_noise = ch.is_numeric() || ch.is_ascii_punctuation();
            (visible + 1, noisy + usize::from(is_noise))
        });

    match visible {
        // Empty or whitespace-only chunks carry no signal.
        0 => true,
        _ => (noisy as f64 / visible as f64) > 0.5,
    }
}

/// Number of tokens set aside for special tokens such as `[CLS]` and `[SEP]`,
/// which could be introduced during per-chunk tokenization.
///
/// NOTE(review): presumably subtracted from the per-chunk token budget by the
/// chunking code — confirm against the call sites.
pub const DEDUCT_SPECIAL_TOKENS: usize = 2;

Expand All @@ -173,6 +191,11 @@ fn add_token_range<'s>(
return;
}

if is_noisy(&src[start_byte..end_byte]) {
debug!("skipping noisy chunk");
return;
}

debug_assert!(
o.end - o.start < 256,
"chunk too large: {} tokens in {:?} bytes {:?}",
Expand Down