#include "SentimentAnalyzer.hpp"

#include <algorithm>
#include <cctype>
#include <string>
#include <string_view>
#include <vector>

// Sentiment analysis over mixed English/Chinese text: greedy longest-match
// tokenization against a weighted lexicon, then a length-normalized score.

SentimentAnalyzer::SentimentAnalyzer() {
    loadLexicon();
}

// Populates the lexicon with English and Chinese sentiment words and their
// signed weights (positive = favorable, negative = unfavorable).
void SentimentAnalyzer::loadLexicon() {
    lexicon = {
        // English
        {"good", 1.0}, {"great", 1.5}, {"excellent", 2.0}, {"amazing", 2.0},
        {"happy", 1.0}, {"love", 1.5},
        {"bad", -1.0}, {"terrible", -1.5}, {"awful", -2.0}, {"hate", -1.5},
        {"sad", -1.0}, {"worst", -2.0},
        // Chinese Positive
        {"好", 1.0}, {"棒", 1.5}, {"优秀", 2.0}, {"精彩", 2.0},
        {"开心", 1.0}, {"喜欢", 1.5}, {"不错", 1.2}, {"满意", 1.5},
        {"爱", 1.5}, {"快乐", 1.0}, {"推荐", 1.5}, {"给力", 1.5},
        // Chinese Negative
        {"坏", -1.0}, {"差", -1.0}, {"糟糕", -1.5}, {"讨厌", -1.5},
        {"难过", -1.0}, {"慢", -1.0}, {"拥堵", -1.0}, {"失望", -1.5},
        {"垃圾", -2.0}, {"恨", -1.5}, {"悲伤", -1.0}, {"愤怒", -1.5},
        // Chinese Complex/Mixed (handled as neutral or specific weights)
        {"复杂", -0.5} // "Complex" often implies mixed feelings
    };
}

// Returns the byte length (1-4) of the UTF-8 sequence whose lead byte is c.
// Works on the unsigned byte value: on platforms where `char` is signed,
// lead bytes >= 0x80 sign-extend and the masked comparisons for 2-4 byte
// sequences would never match.
int get_utf8_sequence_length(char c) {
    const auto byte = static_cast<unsigned char>(c);
    if ((byte & 0x80) == 0x00) return 1; // ASCII
    if ((byte & 0xE0) == 0xC0) return 2;
    if ((byte & 0xF0) == 0xE0) return 3; // covers most CJK
    if ((byte & 0xF8) == 0xF0) return 4;
    return 1; // invalid lead byte: consume a single byte and move on
}

// Splits text into tokens, in priority order:
//   1. longest lexicon entry matching at the current position (max-match),
//   2. a run of ASCII alphanumerics, lower-cased into one word,
//   3. a single non-ASCII UTF-8 character.
// ASCII punctuation and whitespace are skipped (legacy behavior the unit
// tests depend on).
std::vector<std::string> SentimentAnalyzer::tokenize(std::string_view text) const {
    std::vector<std::string> tokens;
    size_t i = 0;
    while (i < text.length()) {
        // 1. Longest dictionary match at position i (max-match segmentation).
        std::string best_match;
        for (const auto& [word, weight] : lexicon) {
            if (word.length() > best_match.length() &&
                text.compare(i, word.length(), word) == 0) {
                best_match = word;
            }
        }
        if (!best_match.empty()) {
            tokens.push_back(best_match);
            i += best_match.length();
            continue;
        }

        // 2. No dictionary match: English/numbers or a single UTF-8 char.
        // Cast before the <cctype> calls: passing a negative char is UB.
        const unsigned char c = static_cast<unsigned char>(text[i]);
        if (std::isalnum(c)) {
            // Consume the whole alphanumeric run as one lower-cased word.
            std::string word;
            while (i < text.length() &&
                   std::isalnum(static_cast<unsigned char>(text[i]))) {
                word += static_cast<char>(
                    std::tolower(static_cast<unsigned char>(text[i])));
                ++i;
            }
            tokens.push_back(word);
        } else if ((c & 0x80) == 0) {
            // ASCII but not alphanumeric (punctuation, space, etc.):
            // skip it to match legacy behavior (and pass unit tests).
            ++i;
        } else {
            // Lead byte of a multi-byte UTF-8 sequence: emit one character.
            // No whitespace filter needed here — every ASCII byte (including
            // all C-locale whitespace) was handled by the branches above.
            size_t len = static_cast<size_t>(get_utf8_sequence_length(text[i]));
            if (i + len > text.length()) {
                len = 1; // truncated sequence at end of input
            }
            tokens.emplace_back(text.substr(i, len));
            i += len;
        }
    }
    return tokens;
}

// Scores text roughly in [-1, 1]: sums the lexicon weights of all tokens,
// then normalizes by a token-count-based factor. Returns 0.0 for input
// that yields no tokens.
double SentimentAnalyzer::analyze(std::string_view text) const {
    const auto tokens = tokenize(text);
    if (tokens.empty()) return 0.0;

    double score = 0.0;
    for (const auto& token : tokens) {
        // Single lookup instead of count() + at().
        const auto it = lexicon.find(token);
        if (it != lexicon.end()) {
            score += it->second;
        }
    }

    // Normalize score roughly between -1 and 1 for typical short sentences.
    // This is a simplified normalization logic.
    const double denom = tokens.size() * 0.5 + 1;
    return std::clamp(score / denom, -1.0, 1.0);
}

// ---------------------------------------------------------------------------
// C wrapper for ctypes
// ---------------------------------------------------------------------------
extern "C" {

SentimentAnalyzer* SentimentAnalyzer_create() {
    return new SentimentAnalyzer();
}

double SentimentAnalyzer_analyze(SentimentAnalyzer* analyzer, const char* text) {
    if (analyzer && text) {
        return analyzer->analyze(text);
    }
    return 0.0;
}

void SentimentAnalyzer_destroy(SentimentAnalyzer* analyzer) {
    delete analyzer;
}

// Opaque handle to a heap-allocated std::vector<std::string>.
// Ownership passes to the caller; release with TokenList_destroy().
typedef void* TokenListHandle;

TokenListHandle SentimentAnalyzer_tokenize(SentimentAnalyzer* analyzer,
                                           const char* text) {
    if (!analyzer || !text) return nullptr;
    // Move the result straight into the heap vector (no extra copy).
    return new std::vector<std::string>(analyzer->tokenize(text));
}

int TokenList_get_size(TokenListHandle handle) {
    if (!handle) return 0;
    const auto* tokens = static_cast<const std::vector<std::string>*>(handle);
    return static_cast<int>(tokens->size());
}

const char* TokenList_get_token(TokenListHandle handle, int index) {
    if (!handle) return nullptr;
    const auto* tokens = static_cast<const std::vector<std::string>*>(handle);
    if (index < 0 || index >= static_cast<int>(tokens->size())) return nullptr;
    // .c_str() is valid as long as the vector exists and isn't modified.
    return (*tokens)[index].c_str();
}

void TokenList_destroy(TokenListHandle handle) {
    if (handle) {
        delete static_cast<std::vector<std::string>*>(handle);
    }
}

} // extern "C"