{
"current_model": {
"name": "Granite-107M-Multilingual",
"repo": "ibm-granite/granite-embedding-107m-multilingual",
"params": "107M",
"pros": [
"Already integrated and working",
"Fast (107M parameters)",
"Proven in production tests",
"Correctly deduplicated Gemma-3 (47.8% dupes)",
"0% false positives with Qwen2.5 1.5B"
],
"cons": [
"Smaller model (107M vs 500M+)",
"May miss nuanced similarities"
],
"test_results": {
"qwen2.5_1.5b_extraction": {
"duplicate_rate": "0%",
"deduplication_accuracy": "100%",
"note": "Extracted text was already unique per window, so there were no duplicates to remove"
},
"gemma3_1b_extraction": {
"duplicate_rate": "47.8%",
"deduplication_accuracy": "100%",
"note": "Correctly identified all duplicates"
}
}
},
"alternatives": {
"bge_m3": {
"name": "BGE-M3",
"repo": "BAAI/bge-m3",
"gguf_repo": "lm-kit/bge-m3-gguf",
"params": "568M",
"pros": [
"SOTA on MTEB Chinese benchmarks",
"Larger model (568M vs 107M)",
"Better semantic understanding"
],
"cons": [
"5x larger (slower)",
"Officially distributed for sentence-transformers; GGUF is a third-party conversion",
"Unverified whether the GGUF conversion works with llama-cpp"
],
"recommendation": "Worth testing if accuracy issues arise"
},
"multilingual_e5": {
"name": "Multilingual-E5-Large",
"repo": "intfloat/multilingual-e5-large",
"params": "560M",
"pros": [
"Microsoft-backed, widely tested",
"Excellent for multilingual",
"Good for Chinese text"
],
"cons": [
"5x larger than Granite-107M",
"Requires sentence-transformers",
"No GGUF version readily available"
],
"recommendation": "Consider if the project switches to sentence-transformers"
}
},
"recommendation": {
"current_status": "KEEP Granite-107M",
"rationale": [
"Working correctly in production",
"Fast enough for real-time use",
"Zero false positives in tests",
"Simple GGUF integration"
],
"when_to_upgrade": [
"If false positives/negatives appear in production",
"If better semantic matching is needed (beyond exact-duplicate detection)",
"If very long texts are processed (better context understanding is needed)"
],
"suggested_thresholds": {
"strict": 0.9,
"default": 0.85,
"lenient": 0.8
}
}
}
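
A minimal sketch of how the thresholds above could be applied, assuming they are cosine-similarity cutoffs and that the Granite model is loaded from a GGUF file through llama-cpp-python. The model path and the embed/deduplicate helpers are illustrative assumptions, not the actual tiny-scribe implementation.

import numpy as np
from llama_cpp import Llama

# Cosine-similarity cutoffs mirroring "suggested_thresholds" above.
THRESHOLDS = {"strict": 0.9, "default": 0.85, "lenient": 0.8}

# Hypothetical local path to a GGUF build of granite-embedding-107m-multilingual.
embedder = Llama(
    model_path="models/granite-embedding-107m-multilingual-Q8_0.gguf",
    embedding=True,
    verbose=False,
)

def embed(text: str) -> np.ndarray:
    """Return an L2-normalized embedding so a dot product equals cosine similarity."""
    vec = np.asarray(embedder.embed(text), dtype=np.float32)
    if vec.ndim > 1:  # some llama-cpp-python builds return per-token vectors
        vec = vec.mean(axis=0)
    return vec / np.linalg.norm(vec)

def deduplicate(windows: list[str], mode: str = "default") -> list[str]:
    """Greedily keep each text window unless it is too similar to one already kept."""
    cutoff = THRESHOLDS[mode]
    kept: list[str] = []
    kept_vecs: list[np.ndarray] = []
    for text in windows:
        vec = embed(text)
        if any(float(vec @ prev) >= cutoff for prev in kept_vecs):
            continue  # near-duplicate of an earlier window; drop it
        kept.append(text)
        kept_vecs.append(vec)
    return kept

if __name__ == "__main__":
    windows = [
        "The quarterly budget review moved to next Tuesday.",
        "Quarterly budget review has been moved to next Tuesday.",  # near-duplicate
        "Lunch order: two vegetarian, one gluten-free.",
    ]
    unique = deduplicate(windows, mode="default")
    print(f"kept {len(unique)} of {len(windows)} windows")

The greedy keep-first pass matches the behavior described in test_results: a window is dropped only when its similarity to an already-kept window reaches the chosen cutoff, so fully unique extractions (the Qwen2.5 1.5B case) pass through untouched.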