#include <iostream>
#include <cuda_runtime.h>
#include <fstream>
#include <cmath>
#include <vector>
#include <string>
|
|
// Accumulates the sum of squared differences between predictions and targets
// into *sum (the MSE numerator; caller divides by numElements).
//
// Launch: 1D grid, blockDim.x a multiple of 32 (warp size). *sum must be
// zeroed before launch. Grid-stride loop makes the kernel correct for any
// grid size, and indices are widened to size_t before multiplication so
// numElements may exceed 2^31.
//
// Contention note: instead of one atomicAdd per element, each thread keeps a
// private partial sum, the warp reduces via __shfl_down_sync, and only lane 0
// of each warp issues an atomicAdd — 32x fewer atomics. Requires SM30+.
__global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    float partial = 0.0f;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < numElements; idx += stride) {
        float diff = predictions[idx] - targets[idx];
        partial += diff * diff;
    }
    // Warp-level tree reduction; all 32 lanes reach here (threads past the
    // tail simply contribute 0.0f), so the full mask is valid.
    for (int offset = 16; offset > 0; offset >>= 1)
        partial += __shfl_down_sync(0xffffffffu, partial, offset);
    if ((threadIdx.x & 31) == 0)
        atomicAdd(sum, partial);
}
|
|
// Reads exactly `size` floats from a raw binary file into `data`.
// Exits the process with status 1 if the file cannot be opened or is
// shorter than size * sizeof(float) bytes (the original silently accepted
// short reads, leaving the tail of `data` uninitialized).
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
    const std::streamsize bytes = static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), bytes);
    // gcount() reports how many bytes the last unformatted read delivered;
    // anything less than requested means a truncated or corrupt file.
    if (in.gcount() != bytes) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
}
|
|
// Returns true when a and b agree within an absolute tolerance `tol`
// (default 0.1). NaN inputs compare as not-equal, matching fabs behavior.
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    float delta = a - b;
    if (delta < 0.0f) {
        delta = -delta;
    }
    return delta < tol;
}
|
|
// Driver: for each test size, loads prediction/target/reference binaries
// from data/, computes MSE on the GPU, and compares against the reference
// scalar. Prints "F" and stops on the first failure, "T" if all pass.
int main() {
    // Element counts 2^10 .. 2^18; test files are numbered 1..sizes.size().
    std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18};
    bool all_passed = true;

    // Every CUDA runtime call returns a cudaError_t; an unchecked failure
    // (e.g. cudaMalloc on an exhausted device) would silently produce
    // garbage results. Centralize the check here.
    auto cuda_check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
    };

    // size_t loop index avoids the signed/unsigned comparison with size().
    for (size_t t = 0; t < sizes.size(); ++t) {
        size_t N = sizes[t];
        size_t input_size = N * sizeof(float);

        std::string pred_file = "data/mse_preds_" + std::to_string(t + 1) + ".bin";
        std::string target_file = "data/mse_targets_" + std::to_string(t + 1) + ".bin";
        std::string ref_file = "data/mse_ref_" + std::to_string(t + 1) + ".bin";

        float* h_preds = (float*)malloc(input_size);
        float* h_targets = (float*)malloc(input_size);
        if (!h_preds || !h_targets) {
            std::cerr << "Host allocation failed" << std::endl;
            exit(1);
        }
        float h_mse_ref;

        read_binary(pred_file, h_preds, N);
        read_binary(target_file, h_targets, N);
        read_binary(ref_file, &h_mse_ref, 1);

        float *d_preds, *d_targets, *d_sum;
        cuda_check(cudaMalloc(&d_preds, input_size), "cudaMalloc d_preds");
        cuda_check(cudaMalloc(&d_targets, input_size), "cudaMalloc d_targets");
        cuda_check(cudaMalloc(&d_sum, sizeof(float)), "cudaMalloc d_sum");
        cuda_check(cudaMemcpy(d_preds, h_preds, input_size, cudaMemcpyHostToDevice),
                   "cudaMemcpy h_preds");
        cuda_check(cudaMemcpy(d_targets, h_targets, input_size, cudaMemcpyHostToDevice),
                   "cudaMemcpy h_targets");
        cuda_check(cudaMemset(d_sum, 0, sizeof(float)), "cudaMemset d_sum");

        int threads = 256;                            // multiple of warp size
        int blocks = (int)((N + threads - 1) / threads);  // ceil-div
        mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);
        // Launch-configuration errors only surface via cudaGetLastError();
        // in-kernel faults surface at the next synchronizing call (the
        // blocking cudaMemcpy below).
        cuda_check(cudaGetLastError(), "mseKernel launch");

        float h_sum = 0.0f;
        cuda_check(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost),
                   "cudaMemcpy d_sum");
        float mse = h_sum / N;

        if (!compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;

            cudaFree(d_preds); cudaFree(d_targets); cudaFree(d_sum);
            free(h_preds); free(h_targets);
            break;
        }

        cudaFree(d_preds); cudaFree(d_targets); cudaFree(d_sum);
        free(h_preds); free(h_targets);
    }

    if (all_passed) std::cout << "T" << std::endl;
    return 0;
}
|
|