Sync updates for CUDA 13 compat
Files changed:
- build.toml (+2, -0)
- compressed_tensors/int8_quant_kernels.cu (+2, -1)
- cub_helpers.h (+17, -0, new file)
- fp8/common.cu (+2, -1)

The CUB shipped with newer CUDA toolkits retires the legacy cub::Sum and cub::Max operator functors, so the block reductions in these kernels now go through a small cub_helpers.h shim: on CUB >= 2.8 it aliases the libcu++ functors (cuda::std::plus<>, cuda::maximum<>), and on older CUB and on ROCm it keeps the legacy types.
build.toml
CHANGED

```diff
@@ -95,6 +95,7 @@ depends = ["torch"]
 include = ["."]
 src = [
     "compressed_tensors/int8_quant_kernels.cu",
+    "cub_helpers.h",
     "dispatch_utils.h",
     "vectorization_utils.cuh",
 ]
@@ -119,6 +120,7 @@ include = ["."]
 src = [
     "fp8/common.cu",
     "fp8/common.cuh",
+    "cub_helpers.h",
     "dispatch_utils.h",
     "utils.cuh",
     "vectorization.cuh",
```
compressed_tensors/int8_quant_kernels.cu
CHANGED

```diff
@@ -3,6 +3,7 @@
 
 #include <cmath>
 
+#include "../cub_helpers.h"
 #include "../dispatch_utils.h"
 #include "../vectorization_utils.cuh"
 
@@ -168,7 +169,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
   }
   using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage tmp;
-  float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
+  float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
   __shared__ float absmax;
   if (tid == 0) {
     absmax = block_max;
```
cub_helpers.h
ADDED

```diff
@@ -0,0 +1,17 @@
+#pragma once
+
+#ifndef USE_ROCM
+#include <cub/cub.cuh>
+#if CUB_VERSION >= 200800
+#include <cuda/std/functional>
+using CubAddOp = cuda::std::plus<>;
+using CubMaxOp = cuda::maximum<>;
+#else  // if CUB_VERSION < 200800
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+#endif  // CUB_VERSION
+#else
+#include <hipcub/hipcub.hpp>
+using CubAddOp = cub::Sum;
+using CubMaxOp = cub::Max;
+#endif  // USE_ROCM
```
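For context, a minimal sketch of how a kernel consumes these aliases; the kernel itself is hypothetical (not part of this change), but the BlockReduce call mirrors the pattern in the kernels updated in this commit.

```cuda
// Hypothetical example kernel, assuming cub_helpers.h is on the include path.
#include "cub_helpers.h"

// One 256-thread block reduces n inputs to a single absolute maximum.
__global__ void block_absmax_kernel(const float* in, float* out, int n) {
  using BlockReduce = cub::BlockReduce<float, 256>;
  __shared__ typename BlockReduce::TempStorage tmp;

  // Each thread strides over the input to build a local max.
  float thread_max = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    thread_max = fmaxf(thread_max, fabsf(in[i]));
  }

  // CubMaxOp{} resolves to cub::Max on CUB < 2.8 and on ROCm, and to
  // cuda::maximum<> on CUB >= 2.8, so this line compiles everywhere.
  float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);

  // Only thread 0 receives the valid reduction result.
  if (threadIdx.x == 0) *out = block_max;
}
```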
fp8/common.cu
CHANGED

```diff
@@ -1,5 +1,6 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
+#include "../cub_helpers.h"
 
 #include <c10/cuda/CUDAGuard.h>
 
@@ -55,7 +56,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage reduceStorage;
   float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(reduceStorage).Reduce(absmax_val, CubMaxOp{}, blockDim.x);
   __shared__ float token_scale;
   if (tid == 0) {
     if (scale_ub) {
```