#include <algorithm>
#include <cctype>
#include <string>
#include <string_view>
#include <vector>

#include "SentimentAnalyzer.h" // class declaration (exact header name assumed)

SentimentAnalyzer::SentimentAnalyzer() {
    loadLexicon();
}
void SentimentAnalyzer::loadLexicon() {
    // Extended lexicon with English and Chinese sentiment words
    lexicon = {
        // English
        {"good", 1.0}, {"great", 1.5}, {"excellent", 2.0}, {"amazing", 2.0},
        {"happy", 1.0}, {"love", 1.5}, {"bad", -1.0}, {"terrible", -1.5},
        {"awful", -2.0}, {"hate", -1.5}, {"sad", -1.0}, {"worst", -2.0},
        // Chinese positive
        {"好", 1.0}, {"棒", 1.5}, {"优秀", 2.0}, {"精彩", 2.0},
        {"开心", 1.0}, {"喜欢", 1.5}, {"不错", 1.2}, {"满意", 1.5},
        {"爱", 1.5}, {"快乐", 1.0}, {"推荐", 1.5}, {"给力", 1.5},
        // Chinese negative
        {"坏", -1.0}, {"差", -1.0}, {"糟糕", -1.5}, {"讨厌", -1.5},
        {"难过", -1.0}, {"慢", -1.0}, {"拥堵", -1.0}, {"失望", -1.5},
        {"垃圾", -2.0}, {"恨", -1.5}, {"悲伤", -1.0}, {"愤怒", -1.5},
        // Chinese complex/mixed (handled as neutral or with specific weights)
        {"复杂", -0.5} // "Complex" often implies mixed feelings
    };
}
// Helper to determine the byte length of a UTF-8 sequence from its lead byte
int get_utf8_sequence_length(char c) {
    const auto b = static_cast<unsigned char>(c);
    if ((b & 0x80) == 0x00) return 1; // ASCII
    if ((b & 0xE0) == 0xC0) return 2; // 2-byte sequence
    if ((b & 0xF0) == 0xE0) return 3; // 3-byte sequence
    if ((b & 0xF8) == 0xF0) return 4; // 4-byte sequence
    return 1; // Invalid lead byte (e.g. a continuation byte): consume one byte
}
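
// Illustration of the lead-byte logic above: "好" (U+597D) is encoded in UTF-8
// as 0xE5 0xA5 0xBD; the lead byte satisfies (0xE5 & 0xF0) == 0xE0, so the
// helper reports a 3-byte sequence, while an ASCII byte like 'a' (0x61) has
// the high bit clear and reports 1:
//
//   assert(get_utf8_sequence_length('a') == 1);
//   assert(get_utf8_sequence_length('\xE5') == 3); // first byte of "好"
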
std::vector<std::string> SentimentAnalyzer::tokenize(std::string_view text) const {
    std::vector<std::string> tokens;
    size_t i = 0;
    while (i < text.length()) {
        // 1. Check for a dictionary match (maximum matching): keep the longest
        //    lexicon word that starts at position i.
        std::string best_match;
        for (const auto& [word, score] : lexicon) {
            if (text.compare(i, word.size(), word) == 0 && // prefix match at i
                word.length() > best_match.length()) {
                best_match = word;
            }
        }
        if (!best_match.empty()) {
            tokens.push_back(best_match);
            i += best_match.length();
            continue;
        }
        // 2. No dictionary match: handle English/numbers or a single UTF-8 char
        char c = text[i];
        if (std::isalnum(static_cast<unsigned char>(c))) {
            // Consume a full alphanumeric word, lowercased for lexicon lookups.
            // The casts avoid undefined behavior for bytes outside [0, 127].
            std::string word;
            while (i < text.length() && std::isalnum(static_cast<unsigned char>(text[i]))) {
                word += static_cast<char>(std::tolower(static_cast<unsigned char>(text[i])));
                i++;
            }
            tokens.push_back(word);
        } else {
            if ((c & 0x80) == 0) {
                // ASCII but not alphanumeric (punctuation, space, etc.):
                // skip it to match the legacy behavior (and pass the unit tests)
                i++;
                continue;
            }
            // Consume one full UTF-8 character (lead byte has the high bit set)
            size_t len = static_cast<size_t>(get_utf8_sequence_length(c));
            // Ensure we don't read past the end of a truncated sequence
            if (i + len > text.length()) len = 1;
            std::string utf8_char(text.substr(i, len));
            // Only add non-whitespace characters
            bool is_space = true;
            for (char b : utf8_char) {
                if (!std::isspace(static_cast<unsigned char>(b))) {
                    is_space = false;
                    break;
                }
            }
            if (!is_space) {
                tokens.push_back(utf8_char);
            }
            i += len;
        }
    }
    return tokens;
}
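
// Illustration of the tokenizer's behavior with the lexicon above:
//   tokenize("I love 北京, 很好!")  ->  {"i", "love", "北", "京", "很", "好"}
// "love" and "好" are matched via the dictionary, "北京" and "很" are not in
// the lexicon and fall back to single UTF-8 characters, "I" is consumed as an
// alphanumeric word and lowercased, and ASCII punctuation/whitespace is skipped.
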
double SentimentAnalyzer::analyze(std::string_view text) const {
    auto tokens = tokenize(text);
    if (tokens.empty()) return 0.0;
    double score = 0.0;
    for (const auto& token : tokens) {
        // Single lookup instead of count() followed by at()
        if (auto it = lexicon.find(token); it != lexicon.end()) {
            score += it->second;
        }
    }
    // Roughly normalize the score to [-1, 1] for typical short sentences.
    // This is a simplified normalization heuristic.
    return std::max(-1.0, std::min(1.0, score / (tokens.size() * 0.5 + 1)));
}
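
// Worked example of the scoring above: analyze("这个产品很好") produces six
// single-character tokens of which only "好" scores (+1.0), so the result is
// 1.0 / (6 * 0.5 + 1) = 0.25; analyze("great movie") gives two tokens and
// 1.5 / (2 * 0.5 + 1) = 0.75.
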
// C wrapper for ctypes
extern "C" {

SentimentAnalyzer* SentimentAnalyzer_create() {
    return new SentimentAnalyzer();
}

double SentimentAnalyzer_analyze(SentimentAnalyzer* analyzer, const char* text) {
    if (analyzer && text) {
        return analyzer->analyze(text);
    }
    return 0.0;
}

void SentimentAnalyzer_destroy(SentimentAnalyzer* analyzer) {
    delete analyzer;
}
// Tokenization wrappers
typedef void* TokenListHandle;

TokenListHandle SentimentAnalyzer_tokenize(SentimentAnalyzer* analyzer, const char* text) {
    if (!analyzer || !text) return nullptr;
    // Heap-allocate the token vector; the caller owns the handle and must
    // release it with TokenList_destroy
    return new std::vector<std::string>(analyzer->tokenize(text));
}

int TokenList_get_size(TokenListHandle handle) {
    if (!handle) return 0;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    return static_cast<int>(tokens->size());
}

const char* TokenList_get_token(TokenListHandle handle, int index) {
    if (!handle) return nullptr;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    if (index < 0 || index >= static_cast<int>(tokens->size())) return nullptr;
    // .c_str() stays valid as long as the vector exists and isn't modified
    return (*tokens)[index].c_str();
}

void TokenList_destroy(TokenListHandle handle) {
    if (handle) {
        delete static_cast<std::vector<std::string>*>(handle);
    }
}

} // extern "C"
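
// Example Python-side usage via ctypes (a minimal sketch; the shared-library
// name "libsentiment.so" and the build setup are assumptions):
//
//   import ctypes
//
//   lib = ctypes.CDLL("./libsentiment.so")
//   lib.SentimentAnalyzer_create.restype = ctypes.c_void_p
//   lib.SentimentAnalyzer_analyze.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
//   lib.SentimentAnalyzer_analyze.restype = ctypes.c_double  # defaults to int
//   lib.SentimentAnalyzer_destroy.argtypes = [ctypes.c_void_p]
//
//   analyzer = lib.SentimentAnalyzer_create()
//   print(lib.SentimentAnalyzer_analyze(analyzer, "这个产品很好".encode("utf-8")))
//   lib.SentimentAnalyzer_destroy(analyzer)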