// cpp-sentiment-analyzer / src/SentimentAnalyzer.cpp
// Author: duqing2026 — commit e8f9f2e ("开发互动界面" / develop interactive interface)
#include "SentimentAnalyzer.hpp"
#include <sstream>
#include <algorithm>
#include <cctype>
// Constructs the analyzer and eagerly populates the sentiment lexicon so
// tokenize()/analyze() can be called immediately after construction.
SentimentAnalyzer::SentimentAnalyzer() {
loadLexicon();
}
// Fills `lexicon` with the built-in English + Chinese sentiment vocabulary.
// Positive weights mark positive sentiment, negative weights negative; the
// magnitude (0.5 .. 2.0) reflects intensity.
void SentimentAnalyzer::loadLexicon() {
    struct Entry {
        const char* word;
        double weight;
    };
    // Static table so the literal data is built once, not on every call.
    static const Entry kEntries[] = {
        // English
        {"good", 1.0},   {"great", 1.5},  {"excellent", 2.0}, {"amazing", 2.0},
        {"happy", 1.0},  {"love", 1.5},   {"bad", -1.0},      {"terrible", -1.5},
        {"awful", -2.0}, {"hate", -1.5},  {"sad", -1.0},      {"worst", -2.0},
        // Chinese positive
        {"好", 1.0},   {"棒", 1.5},   {"优秀", 2.0}, {"精彩", 2.0},
        {"开心", 1.0}, {"喜欢", 1.5}, {"不错", 1.2}, {"满意", 1.5},
        {"爱", 1.5},   {"快乐", 1.0}, {"推荐", 1.5}, {"给力", 1.5},
        // Chinese negative
        {"坏", -1.0},   {"差", -1.0}, {"糟糕", -1.5}, {"讨厌", -1.5},
        {"难过", -1.0}, {"慢", -1.0}, {"拥堵", -1.0}, {"失望", -1.5},
        {"垃圾", -2.0}, {"恨", -1.5}, {"悲伤", -1.0}, {"愤怒", -1.5},
        // "复杂" ("complex") often implies mixed feelings -> slightly negative
        {"复杂", -0.5},
    };
    lexicon.clear();
    for (const Entry& e : kEntries) {
        lexicon.emplace(e.word, e.weight);
    }
}
// Returns the byte length (1-4) of the UTF-8 sequence whose lead byte is `c`.
// Continuation bytes and invalid lead bytes fall back to 1 so callers always
// make forward progress through the input.
int get_utf8_sequence_length(char c) {
    const unsigned char lead = static_cast<unsigned char>(c);
    if (lead < 0x80)          return 1;  // 0xxxxxxx: single-byte ASCII
    if ((lead >> 5) == 0x06)  return 2;  // 110xxxxx: 2-byte sequence
    if ((lead >> 4) == 0x0E)  return 3;  // 1110xxxx: 3-byte sequence
    if ((lead >> 3) == 0x1E)  return 4;  // 11110xxx: 4-byte sequence
    return 1;                            // continuation / invalid byte
}
// Tokenizes `text` into a mix of lexicon words (longest-match, so multi-char
// Chinese entries win over single chars), lowercased ASCII alphanumeric runs,
// and individual non-ASCII UTF-8 characters. ASCII punctuation/whitespace is
// dropped.
//
// FIX: every <cctype> call (isalnum/tolower/isspace) now casts its argument
// to unsigned char first. Passing a plain char that holds a UTF-8 byte
// >= 0x80 yields a negative int, which is undefined behavior for these
// functions.
std::vector<std::string> SentimentAnalyzer::tokenize(std::string_view text) const {
    std::vector<std::string> tokens;
    size_t i = 0;
    while (i < text.length()) {
        // 1. Dictionary maximum-match: pick the longest lexicon entry that
        //    the remaining text starts with. rfind(word, 0) == 0 is a
        //    starts-with test; string_view::substr is a cheap view, no copy.
        std::string best_match;
        for (const auto& [word, score] : lexicon) {
            if (text.substr(i).rfind(word, 0) == 0 &&
                word.length() > best_match.length()) {
                best_match = word;
            }
        }
        if (!best_match.empty()) {
            tokens.push_back(best_match);
            i += best_match.length();
            continue;
        }
        // 2. No dictionary match: English/number run, ASCII filler, or one
        //    multi-byte UTF-8 character.
        const unsigned char c = static_cast<unsigned char>(text[i]);
        if (std::isalnum(c)) {
            // Consume the full alphanumeric word, lowercased.
            std::string word;
            while (i < text.length() &&
                   std::isalnum(static_cast<unsigned char>(text[i]))) {
                word += static_cast<char>(
                    std::tolower(static_cast<unsigned char>(text[i])));
                i++;
            }
            tokens.push_back(word);
        } else if ((c & 0x80) == 0) {
            // ASCII but not alphanumeric (punctuation, space, etc.): skip it
            // to match legacy behavior (and pass unit tests).
            i++;
        } else {
            // Consume one UTF-8 character (lead byte has the high bit set).
            size_t len = static_cast<size_t>(
                get_utf8_sequence_length(static_cast<char>(c)));
            if (i + len > text.length()) len = 1;  // truncated-sequence guard
            std::string utf8_char(text.substr(i, len));
            // Keep the legacy non-whitespace filter (bytes >= 0x80 are never
            // ASCII whitespace, but stray single-byte symbols pass through).
            bool is_space = true;
            for (char b : utf8_char) {
                if (!std::isspace(static_cast<unsigned char>(b))) {
                    is_space = false;
                    break;
                }
            }
            if (!is_space) {
                tokens.push_back(utf8_char);
            }
            i += len;
        }
    }
    return tokens;
}
// Scores `text` by summing lexicon weights over its tokens, then squashing
// the sum into roughly [-1, 1]. Returns 0.0 (neutral) for empty/tokenless
// input.
//
// FIX: single lexicon.find() per token instead of count() + at(), which did
// two lookups for every hit.
double SentimentAnalyzer::analyze(std::string_view text) const {
    auto tokens = tokenize(text);
    if (tokens.empty()) return 0.0;
    double score = 0.0;
    for (const auto& token : tokens) {
        if (auto it = lexicon.find(token); it != lexicon.end()) {
            score += it->second;
        }
    }
    // Simplified normalization: divide by a token-count-based scale, then
    // clamp to [-1, 1] so typical short sentences land in that range.
    double normalized_score =
        std::max(-1.0, std::min(1.0, score / (tokens.size() * 0.5 + 1)));
    return normalized_score;
}
// ---------------------------------------------------------------------------
// C wrappers for ctypes (Python FFI).
//
// Boundary rules:
//  * A C++ exception must never propagate into a C caller — that is
//    undefined behavior — so every allocating wrapper catches everything
//    and returns a failure value instead.
//  * Each *_create/_tokenize result must be released with the matching
//    *_destroy; both destroys are null-safe.
// ---------------------------------------------------------------------------
extern "C" {

/// Heap-allocates an analyzer. Returns nullptr on failure (e.g. bad_alloc)
/// instead of letting the exception escape across the C boundary.
SentimentAnalyzer* SentimentAnalyzer_create() {
    try {
        return new SentimentAnalyzer();
    } catch (...) {
        return nullptr;
    }
}

/// Scores `text`; returns 0.0 (neutral) for null arguments or on failure.
double SentimentAnalyzer_analyze(SentimentAnalyzer* analyzer, const char* text) {
    if (!analyzer || !text) return 0.0;
    try {
        return analyzer->analyze(text);
    } catch (...) {
        return 0.0;  // swallow — never throw into C
    }
}

/// Releases an analyzer created by SentimentAnalyzer_create (null-safe).
void SentimentAnalyzer_destroy(SentimentAnalyzer* analyzer) {
    delete analyzer;
}

// --- Tokenization wrappers ---

/// Opaque handle to a heap-allocated std::vector<std::string>.
typedef void* TokenListHandle;

/// Tokenizes `text`. Returns nullptr on null arguments or failure. The
/// caller owns the handle and must free it with TokenList_destroy.
TokenListHandle SentimentAnalyzer_tokenize(SentimentAnalyzer* analyzer, const char* text) {
    if (!analyzer || !text) return nullptr;
    try {
        // Construct the heap vector directly from the rvalue returned by
        // tokenize() — the previous copy-from-local duplicated every token.
        return new std::vector<std::string>(analyzer->tokenize(text));
    } catch (...) {
        return nullptr;
    }
}

/// Number of tokens in the list; 0 for a null handle.
int TokenList_get_size(TokenListHandle handle) {
    if (!handle) return 0;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    return static_cast<int>(tokens->size());
}

/// Borrowed pointer to token `index`, or nullptr if the handle is null or
/// the index is out of range. Valid only while the handle is alive.
const char* TokenList_get_token(TokenListHandle handle, int index) {
    if (!handle) return nullptr;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    if (index < 0 || index >= static_cast<int>(tokens->size())) return nullptr;
    // .c_str() stays valid as long as the vector exists and isn't modified.
    return (*tokens)[index].c_str();
}

/// Releases a token list created by SentimentAnalyzer_tokenize.
/// (delete on nullptr is a no-op, so no guard is needed.)
void TokenList_destroy(TokenListHandle handle) {
    delete static_cast<std::vector<std::string>*>(handle);
}

}  // extern "C"