#include <algorithm>
#include <cctype>
#include <string>
#include <string_view>
#include <vector>

#include "SentimentAnalyzer.h" // class declaration (exact header name assumed)

SentimentAnalyzer::SentimentAnalyzer() {
    loadLexicon();
}
void SentimentAnalyzer::loadLexicon() {
    // Extended lexicon with English and Chinese sentiment words
    lexicon = {
        // English
        {"good", 1.0}, {"great", 1.5}, {"excellent", 2.0}, {"amazing", 2.0},
        {"happy", 1.0}, {"love", 1.5}, {"bad", -1.0}, {"terrible", -1.5},
        {"awful", -2.0}, {"hate", -1.5}, {"sad", -1.0}, {"worst", -2.0},
        // Chinese positive
        {"好", 1.0}, {"棒", 1.5}, {"优秀", 2.0}, {"精彩", 2.0},
        {"开心", 1.0}, {"喜欢", 1.5}, {"不错", 1.2}, {"满意", 1.5},
        {"爱", 1.5}, {"快乐", 1.0}, {"推荐", 1.5}, {"给力", 1.5},
        // Chinese negative
        {"坏", -1.0}, {"差", -1.0}, {"糟糕", -1.5}, {"讨厌", -1.5},
        {"难过", -1.0}, {"慢", -1.0}, {"拥堵", -1.0}, {"失望", -1.5},
        {"垃圾", -2.0}, {"恨", -1.5}, {"悲伤", -1.0}, {"愤怒", -1.5},
        // Chinese complex/mixed (handled as neutral or with specific weights)
        {"复杂", -0.5} // "Complex" often implies mixed feelings
    };
}
// Helper to determine the byte length of a UTF-8 sequence from its lead byte
int get_utf8_sequence_length(char c) {
    const auto b = static_cast<unsigned char>(c);
    if ((b & 0x80) == 0x00) return 1; // ASCII
    if ((b & 0xE0) == 0xC0) return 2; // 2-byte sequence
    if ((b & 0xF0) == 0xE0) return 3; // 3-byte sequence
    if ((b & 0xF8) == 0xF0) return 4; // 4-byte sequence
    return 1; // Invalid lead byte (e.g. a continuation byte): consume one byte
}
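
// Illustration of the lead-byte logic above: "好" (U+597D) is encoded in UTF-8
// as 0xE5 0xA5 0xBD; the lead byte satisfies (0xE5 & 0xF0) == 0xE0, so the
// helper reports a 3-byte sequence, while an ASCII byte like 'a' (0x61) has
// the high bit clear and reports 1:
//
//   assert(get_utf8_sequence_length('a') == 1);
//   assert(get_utf8_sequence_length('\xE5') == 3); // first byte of "好"
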
std::vector<std::string> SentimentAnalyzer::tokenize(std::string_view text) const {
    std::vector<std::string> tokens;
    size_t i = 0;
    while (i < text.length()) {
        // 1. Check for a dictionary match (maximum matching): keep the longest
        //    lexicon word that starts at position i.
        std::string best_match;
        for (const auto& [word, score] : lexicon) {
            if (text.compare(i, word.size(), word) == 0 && // prefix match at i
                word.length() > best_match.length()) {
                best_match = word;
            }
        }
        if (!best_match.empty()) {
            tokens.push_back(best_match);
            i += best_match.length();
            continue;
        }
        // 2. No dictionary match: handle English/numbers or a single UTF-8 char
        char c = text[i];
        if (std::isalnum(static_cast<unsigned char>(c))) {
            // Consume a full alphanumeric word, lowercased for lexicon lookups.
            // The casts avoid undefined behavior for bytes outside [0, 127].
            std::string word;
            while (i < text.length() && std::isalnum(static_cast<unsigned char>(text[i]))) {
                word += static_cast<char>(std::tolower(static_cast<unsigned char>(text[i])));
                i++;
            }
            tokens.push_back(word);
        } else {
            if ((c & 0x80) == 0) {
                // ASCII but not alphanumeric (punctuation, space, etc.):
                // skip it to match the legacy behavior (and pass the unit tests)
                i++;
                continue;
            }
            // Consume one full UTF-8 character (lead byte has the high bit set)
            size_t len = static_cast<size_t>(get_utf8_sequence_length(c));
            // Ensure we don't read past the end of a truncated sequence
            if (i + len > text.length()) len = 1;
            std::string utf8_char(text.substr(i, len));
            // Only add non-whitespace characters
            bool is_space = true;
            for (char b : utf8_char) {
                if (!std::isspace(static_cast<unsigned char>(b))) {
                    is_space = false;
                    break;
                }
            }
            if (!is_space) {
                tokens.push_back(utf8_char);
            }
            i += len;
        }
    }
    return tokens;
}
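
// Illustration of the tokenizer's behavior with the lexicon above:
//   tokenize("I love 北京, 很好!")  ->  {"i", "love", "北", "京", "很", "好"}
// "love" and "好" are matched via the dictionary, "北京" and "很" are not in
// the lexicon and fall back to single UTF-8 characters, "I" is consumed as an
// alphanumeric word and lowercased, and ASCII punctuation/whitespace is skipped.
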
double SentimentAnalyzer::analyze(std::string_view text) const {
    auto tokens = tokenize(text);
    if (tokens.empty()) return 0.0;
    double score = 0.0;
    for (const auto& token : tokens) {
        // Single lookup instead of count() followed by at()
        if (auto it = lexicon.find(token); it != lexicon.end()) {
            score += it->second;
        }
    }
    // Roughly normalize the score to [-1, 1] for typical short sentences.
    // This is a simplified normalization heuristic.
    return std::max(-1.0, std::min(1.0, score / (tokens.size() * 0.5 + 1)));
}
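
// Worked example of the scoring above: analyze("这个产品很好") produces six
// single-character tokens of which only "好" scores (+1.0), so the result is
// 1.0 / (6 * 0.5 + 1) = 0.25; analyze("great movie") gives two tokens and
// 1.5 / (2 * 0.5 + 1) = 0.75.
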
// C wrapper for ctypes
extern "C" {

SentimentAnalyzer* SentimentAnalyzer_create() {
    return new SentimentAnalyzer();
}

double SentimentAnalyzer_analyze(SentimentAnalyzer* analyzer, const char* text) {
    if (analyzer && text) {
        return analyzer->analyze(text);
    }
    return 0.0;
}

void SentimentAnalyzer_destroy(SentimentAnalyzer* analyzer) {
    delete analyzer;
}
// Tokenization wrappers
typedef void* TokenListHandle;

TokenListHandle SentimentAnalyzer_tokenize(SentimentAnalyzer* analyzer, const char* text) {
    if (!analyzer || !text) return nullptr;
    // Heap-allocate the token vector; the caller owns the handle and must
    // release it with TokenList_destroy
    return new std::vector<std::string>(analyzer->tokenize(text));
}

int TokenList_get_size(TokenListHandle handle) {
    if (!handle) return 0;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    return static_cast<int>(tokens->size());
}

const char* TokenList_get_token(TokenListHandle handle, int index) {
    if (!handle) return nullptr;
    auto* tokens = static_cast<std::vector<std::string>*>(handle);
    if (index < 0 || index >= static_cast<int>(tokens->size())) return nullptr;
    // .c_str() stays valid as long as the vector exists and isn't modified
    return (*tokens)[index].c_str();
}

void TokenList_destroy(TokenListHandle handle) {
    if (handle) {
        delete static_cast<std::vector<std::string>*>(handle);
    }
}

} // extern "C"
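
// Example Python-side usage via ctypes (a minimal sketch; the shared-library
// name "libsentiment.so" and the build setup are assumptions):
//
//   import ctypes
//
//   lib = ctypes.CDLL("./libsentiment.so")
//   lib.SentimentAnalyzer_create.restype = ctypes.c_void_p
//   lib.SentimentAnalyzer_analyze.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
//   lib.SentimentAnalyzer_analyze.restype = ctypes.c_double  # defaults to int
//   lib.SentimentAnalyzer_destroy.argtypes = [ctypes.c_void_p]
//
//   analyzer = lib.SentimentAnalyzer_create()
//   print(lib.SentimentAnalyzer_analyze(analyzer, "这个产品很好".encode("utf-8")))
//   lib.SentimentAnalyzer_destroy(analyzer)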