Here is my code; it's a GEMM benchmark modified from npbench. I profile it with nsys profile -o gemm-tf32-fp32-ray --force-overwrite=true --trace=cuda,cudnn,cublas,osrt,nvtx python run.py --preset L. The error message does not always appear: sometimes, after I modify a few lines of code, it disappears.
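Since both tasks request num_gpus=0.1, Ray schedules them concurrently on the same GPU. To check whether that concurrency is what makes the message intermittent, a serialized variant of the submission (just a sketch, using the same names as in run.py below) would run one task to completion before starting the next:

result = [ray.get(gemm_fp32.remote(args.preset)),   # finish fp32 first
          ray.get(gemm_tf32.remote(args.preset))]   # then run tf32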
#file name: run.py
import ray
import time
import argparse
import cupy as np
from jacobi_2d_cupy import jacobi_2d
from cavity_flow_cupy import cavity_flow
from conv2d_cupy import conv2d_bias
from cholesky2_cupy import cholesky2
from gemm_cupy import gemm_fp32,gemm_tf32,gemm_fp64
from ray.experimental.state.api import summarize_tasks
from cupy.cuda.nvtx import RangePush, RangePop
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--preset",
        type=str,
        default="M",
        help="Set size of the problems.",
    )
    args, _ = parser.parse_known_args()
    datatype = np.float32
    ray.init()
    print("sending task...")
    results = []
    start = time.time()
    RangePush("Nested Powers of A")
    for i in range(1):
        results.append(gemm_fp32.remote(args.preset))
        results.append(gemm_tf32.remote(args.preset))
    RangePop()
    result = ray.get(results)
    print("total time1 =", time.time() - start)
    for i in range(len(result)):
        print(result[i])
    ray.timeline(filename="./timeline/timeline-gemm-tf32-fp32.json")
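One thing worth noting about the NVTX range above: .remote() only submits the tasks and returns immediately, so "Nested Powers of A" closes long before the GEMM kernels actually run. If the range is meant to bracket execution as seen in the nsys timeline, the blocking ray.get() has to sit inside it; a minimal sketch (same names as above):

RangePush("Nested Powers of A")
results = [gemm_fp32.remote(args.preset), gemm_tf32.remote(args.preset)]
result = ray.get(results)  # block until both tasks have finished
RangePop()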
#file name: gemm_cupy.py
import cupy as np  # note: cupy is imported under the alias "np" in this file
import ray
import time
import os
from cupy.cuda.nvtx import RangePush, RangePop

# Problem sizes (NI, NJ, NK) per preset, shared by all three GEMM tasks.
PRESETS = {
    "S": (1000, 1100, 1200),
    "M": (2500, 2750, 3000),
    "L": (7000, 7500, 8000),
    "U": (7000 * 2, 7500 * 2, 8000),
    "paper": (2000, 2300, 2600),
}
def initialize(NI, NJ, NK, datatype=np.float64):
    alpha = datatype(1.5)
    beta = datatype(1.2)
    C = np.fromfunction(lambda i, j: ((i * j + 1) % NI) / NI, (NI, NJ),
                        dtype=datatype)
    A = np.fromfunction(lambda i, k: (i * (k + 1) % NK) / NK, (NI, NK),
                        dtype=datatype)
    B = np.fromfunction(lambda k, j: (k * (j + 2) % NJ) / NJ, (NK, NJ),
                        dtype=datatype)
    return alpha, beta, C, A, B
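Since "np" is actually cupy here, initialize() builds all three operands directly on the GPU, so no host-to-device copies for these operands should show up in the nsys trace. A quick sanity check (sketch):

alpha, beta, C, A, B = initialize(100, 110, 120, np.float32)
print(type(C), C.device)  # <class 'cupy.ndarray'> on the current CUDA device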
@ray.remote(num_gpus=0.1)
def gemm_fp32(preset="S"):
    # Request plain FP32 GEMM. Caveat: CuPy may read CUPY_TF32 only once,
    # so setting it here, after the worker has already imported cupy, may
    # not take effect.
    os.environ['CUPY_TF32'] = "0"
    datatype = np.float32
    NI, NJ, NK = PRESETS[preset]
    alpha, beta, C, A, B = initialize(NI, NJ, NK, datatype)
    RangePush("FP32")
    stream = np.cuda.Stream(non_blocking=True)
    with stream:
        startEvent = stream.record()
        for _ in range(100):
            C[:] = alpha * A @ B + beta * C
        endEvent = stream.record()
    stream.synchronize()
    RangePop()
    total = np.cuda.get_elapsed_time(startEvent, endEvent)  # milliseconds
    return ["fp32", total / 1000]  # seconds
@ray.remote(num_gpus=0.1)
def gemm_tf32(preset="S"):
    datatype = np.float32
    # Request TF32 GEMM. Same caveat as above: CuPy may have read
    # CUPY_TF32 already, in which case this assignment does nothing.
    os.environ['CUPY_TF32'] = "1"
    NI, NJ, NK = PRESETS[preset]
    alpha, beta, C, A, B = initialize(NI, NJ, NK, datatype)
    RangePush("TF32")
    stream = np.cuda.Stream(non_blocking=True)
    with stream:
        startEvent = stream.record()
        for _ in range(100):
            C[:] = alpha * A @ B + beta * C
        endEvent = stream.record()
    stream.synchronize()
    RangePop()
    total = np.cuda.get_elapsed_time(startEvent, endEvent)  # milliseconds
    return ["tf32", total / 1000]  # seconds
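Because of the CUPY_TF32 caveat, it may be worth verifying inside a worker which mode is actually active. One hedged way (a sketch I'd add for debugging, not part of the benchmark) is to compare a float32 matmul against a float64 reference; TF32 keeps roughly 10 mantissa bits, so its error is orders of magnitude larger than plain FP32's:

@ray.remote(num_gpus=0.1)
def check_tf32():
    a = np.random.rand(1024, 1024).astype(np.float32)
    b = np.random.rand(1024, 1024).astype(np.float32)
    ref = a.astype(np.float64) @ b.astype(np.float64)
    # Small max error suggests plain FP32; a much larger one suggests
    # TF32 is in effect for this worker.
    return float(np.abs(a @ b - ref).max())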
@ray.remote(num_gpus=0.5)
def gemm_fp64(preset="S"):
    datatype = np.float64
    os.environ['CUPY_TF32'] = "0"
    NI, NJ, NK = PRESETS[preset]
    alpha, beta, C, A, B = initialize(NI, NJ, NK, datatype)
    start = time.time()
    for _ in range(20):
        C[:] = alpha * A @ B + beta * C
    # CuPy launches kernels asynchronously, so synchronize before reading
    # the wall clock; otherwise only the launch overhead is measured.
    np.cuda.Device().synchronize()
    total = time.time() - start
    return ["fp64", total]  # seconds
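For what it's worth, the sync-before-timing pattern in gemm_fp64 can be factored out; a minimal hedged helper (my own addition, not from npbench):

def timed(fn):
    # Wall-clock a GPU workload correctly: drain pending work, run the
    # workload, then drain again before reading the clock.
    np.cuda.Device().synchronize()
    t0 = time.time()
    fn()
    np.cuda.Device().synchronize()
    return time.time() - t0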