danieldk HF Staff commited on
Commit
70da85f
·
1 Parent(s): 7c5923b

Sync updates for CUDA 13 compat

Browse files
build.toml CHANGED
@@ -95,6 +95,7 @@ depends = ["torch"]
95
  include = ["."]
96
  src = [
97
  "compressed_tensors/int8_quant_kernels.cu",
 
98
  "dispatch_utils.h",
99
  "vectorization_utils.cuh",
100
  ]
@@ -119,6 +120,7 @@ include = ["."]
119
  src = [
120
  "fp8/common.cu",
121
  "fp8/common.cuh",
 
122
  "dispatch_utils.h",
123
  "utils.cuh",
124
  "vectorization.cuh",
 
95
  include = ["."]
96
  src = [
97
  "compressed_tensors/int8_quant_kernels.cu",
98
+ "cub_helpers.h",
99
  "dispatch_utils.h",
100
  "vectorization_utils.cuh",
101
  ]
 
120
  src = [
121
  "fp8/common.cu",
122
  "fp8/common.cuh",
123
+ "cub_helpers.h",
124
  "dispatch_utils.h",
125
  "utils.cuh",
126
  "vectorization.cuh",
compressed_tensors/int8_quant_kernels.cu CHANGED
@@ -3,6 +3,7 @@
3
 
4
  #include <cmath>
5
 
 
6
  #include "../dispatch_utils.h"
7
  #include "../vectorization_utils.cuh"
8
 
@@ -168,7 +169,7 @@ __global__ void dynamic_scaled_int8_quant_kernel(
168
  }
169
  using BlockReduce = cub::BlockReduce<float, 256>;
170
  __shared__ typename BlockReduce::TempStorage tmp;
171
- float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
172
  __shared__ float absmax;
173
  if (tid == 0) {
174
  absmax = block_max;
 
3
 
4
  #include <cmath>
5
 
6
+ #include "../cub_helpers.h"
7
  #include "../dispatch_utils.h"
8
  #include "../vectorization_utils.cuh"
9
 
 
169
  }
170
  using BlockReduce = cub::BlockReduce<float, 256>;
171
  __shared__ typename BlockReduce::TempStorage tmp;
172
+ float block_max = BlockReduce(tmp).Reduce(thread_max, CubMaxOp{}, blockDim.x);
173
  __shared__ float absmax;
174
  if (tid == 0) {
175
  absmax = block_max;
cub_helpers.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #ifndef USE_ROCM
4
+ #include <cub/cub.cuh>
5
+ #if CUB_VERSION >= 200800
6
+ #include <cuda/std/functional>
7
+ using CubAddOp = cuda::std::plus<>;
8
+ using CubMaxOp = cuda::maximum<>;
9
+ #else // if CUB_VERSION < 200800
10
+ using CubAddOp = cub::Sum;
11
+ using CubMaxOp = cub::Max;
12
+ #endif // CUB_VERSION
13
+ #else
14
+ #include <hipcub/hipcub.hpp>
15
+ using CubAddOp = cub::Sum;
16
+ using CubMaxOp = cub::Max;
17
+ #endif // USE_ROCM
fp8/common.cu CHANGED
@@ -1,5 +1,6 @@
1
  #include "common.cuh"
2
  #include "dispatch_utils.h"
 
3
 
4
  #include <c10/cuda/CUDAGuard.h>
5
 
@@ -55,7 +56,7 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
55
  using BlockReduce = cub::BlockReduce<float, 256>;
56
  __shared__ typename BlockReduce::TempStorage reduceStorage;
57
  float const block_absmax_val_maybe =
58
- BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
59
  __shared__ float token_scale;
60
  if (tid == 0) {
61
  if (scale_ub) {
 
1
  #include "common.cuh"
2
  #include "dispatch_utils.h"
3
+ #include "../cub_helpers.h"
4
 
5
  #include <c10/cuda/CUDAGuard.h>
6
 
 
56
  using BlockReduce = cub::BlockReduce<float, 256>;
57
  __shared__ typename BlockReduce::TempStorage reduceStorage;
58
  float const block_absmax_val_maybe =
59
+ BlockReduce(reduceStorage).Reduce(absmax_val, CubMaxOp{}, blockDim.x);
60
  __shared__ float token_scale;
61
  if (tid == 0) {
62
  if (scale_ub) {