#include <iostream>
#include <cuda_runtime.h>
#include <fstream>
#include <cmath>
#include <vector>
#include <string>
|
|
// Accumulates the sum of squared differences between predictions and targets
// into *sum (the MSE numerator; caller divides by numElements).
//
// Launch: 1D grid, blockDim.x a multiple of 32 (warp size). *sum must be
// zeroed before launch. Grid-stride loop makes the kernel correct for any
// grid size, and indices are widened to size_t before multiplication so
// numElements may exceed 2^31.
//
// Contention note: instead of one atomicAdd per element, each thread keeps a
// private partial sum, the warp reduces via __shfl_down_sync, and only lane 0
// of each warp issues an atomicAdd — 32x fewer atomics. Requires SM30+.
__global__ void mseKernel(const float* predictions, const float* targets, size_t numElements, float* sum) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    float partial = 0.0f;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < numElements; idx += stride) {
        float diff = predictions[idx] - targets[idx];
        partial += diff * diff;
    }
    // Warp-level tree reduction; all 32 lanes reach here (threads past the
    // tail simply contribute 0.0f), so the full mask is valid.
    for (int offset = 16; offset > 0; offset >>= 1)
        partial += __shfl_down_sync(0xffffffffu, partial, offset);
    if ((threadIdx.x & 31) == 0)
        atomicAdd(sum, partial);
}
|
|
// Reads exactly `size` floats from a raw binary file into `data`.
// Exits the process with status 1 if the file cannot be opened or is
// shorter than size * sizeof(float) bytes (the original silently accepted
// short reads, leaving the tail of `data` uninitialized).
void read_binary(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
    const std::streamsize bytes = static_cast<std::streamsize>(size * sizeof(float));
    in.read(reinterpret_cast<char*>(data), bytes);
    // gcount() reports how many bytes the last unformatted read delivered;
    // anything less than requested means a truncated or corrupt file.
    if (in.gcount() != bytes) {
        std::cerr << "Cannot open file: " << filename << std::endl;
        exit(1);
    }
}
|
|
// Returns true when a and b agree within an absolute tolerance `tol`
// (default 0.1). NaN inputs compare as not-equal, matching fabs behavior.
bool compare_scalar(float a, float b, float tol = 1e-1f) {
    float delta = a - b;
    if (delta < 0.0f) {
        delta = -delta;
    }
    return delta < tol;
}
|
|
// Driver: for each test size, loads prediction/target/reference binaries
// from data/, computes MSE on the GPU, and compares against the reference
// scalar. Prints "F" and stops on the first failure, "T" if all pass.
int main() {
    // Element counts 2^10 .. 2^18; test files are numbered 1..sizes.size().
    std::vector<size_t> sizes = {1 << 10, 1 << 12, 1 << 14, 1 << 16, 1 << 18};
    bool all_passed = true;

    // Every CUDA runtime call returns a cudaError_t; an unchecked failure
    // (e.g. cudaMalloc on an exhausted device) would silently produce
    // garbage results. Centralize the check here.
    auto cuda_check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
            exit(1);
        }
    };

    // size_t loop index avoids the signed/unsigned comparison with size().
    for (size_t t = 0; t < sizes.size(); ++t) {
        size_t N = sizes[t];
        size_t input_size = N * sizeof(float);

        std::string pred_file = "data/mse_preds_" + std::to_string(t + 1) + ".bin";
        std::string target_file = "data/mse_targets_" + std::to_string(t + 1) + ".bin";
        std::string ref_file = "data/mse_ref_" + std::to_string(t + 1) + ".bin";

        float* h_preds = (float*)malloc(input_size);
        float* h_targets = (float*)malloc(input_size);
        if (!h_preds || !h_targets) {
            std::cerr << "Host allocation failed" << std::endl;
            exit(1);
        }
        float h_mse_ref;

        read_binary(pred_file, h_preds, N);
        read_binary(target_file, h_targets, N);
        read_binary(ref_file, &h_mse_ref, 1);

        float *d_preds, *d_targets, *d_sum;
        cuda_check(cudaMalloc(&d_preds, input_size), "cudaMalloc d_preds");
        cuda_check(cudaMalloc(&d_targets, input_size), "cudaMalloc d_targets");
        cuda_check(cudaMalloc(&d_sum, sizeof(float)), "cudaMalloc d_sum");
        cuda_check(cudaMemcpy(d_preds, h_preds, input_size, cudaMemcpyHostToDevice),
                   "cudaMemcpy h_preds");
        cuda_check(cudaMemcpy(d_targets, h_targets, input_size, cudaMemcpyHostToDevice),
                   "cudaMemcpy h_targets");
        cuda_check(cudaMemset(d_sum, 0, sizeof(float)), "cudaMemset d_sum");

        int threads = 256;                            // multiple of warp size
        int blocks = (int)((N + threads - 1) / threads);  // ceil-div
        mseKernel<<<blocks, threads>>>(d_preds, d_targets, N, d_sum);
        // Launch-configuration errors only surface via cudaGetLastError();
        // in-kernel faults surface at the next synchronizing call (the
        // blocking cudaMemcpy below).
        cuda_check(cudaGetLastError(), "mseKernel launch");

        float h_sum = 0.0f;
        cuda_check(cudaMemcpy(&h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost),
                   "cudaMemcpy d_sum");
        float mse = h_sum / N;

        if (!compare_scalar(mse, h_mse_ref)) {
            std::cout << "F" << std::endl;
            all_passed = false;

            cudaFree(d_preds); cudaFree(d_targets); cudaFree(d_sum);
            free(h_preds); free(h_targets);
            break;
        }

        cudaFree(d_preds); cudaFree(d_targets); cudaFree(d_sum);
        free(h_preds); free(h_targets);
    }

    if (all_passed) std::cout << "T" << std::endl;
    return 0;
}
|
|