Tag Archives: pytorch

[Solved] TVM operate error: TVMError: AssertionError

TVM operate error: TVMError: AssertionError

Traceback (most recent call last):
  File "tune_relay_x86.py", line 248, in <module>
    tune_and_evaluate(tuning_option)
  File "tune_relay_x86.py", line 241, in tune_and_evaluate
    lib = relay.build_module.build(mod, target=target, params=params)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/build_module.py", line 468, in build
    graph_json, runtime_mod, params = bld_mod.build(
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/build_module.py", line 196, in build
    self._build(mod, target, target_host, executor, runtime, workspace_memory_pools, mod_name)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  30: TVMFuncCall
  29: tvm::relay::backend::RelayBuildModule::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#3}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
  28: tvm::relay::backend::RelayBuildModule::BuildRelay(tvm::IRModule, tvm::runtime::String const&)
  27: tvm::relay::backend::RelayBuildModule::OptimizeImpl(tvm::IRModule)
  26: tvm::transform::Pass::operator()(tvm::IRModule) const
  25: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  24: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  23: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  22: tvm::relay::transform::FunctionPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  21: _ZN3tvm7runtime13PackedFuncObj9ExtractorINS0_16PackedFuncSubObjIZNS0_15TypedPackedFuncIFNS_5relay8FunctionES6_NS_8IRModuleENS_9transform11PassContextEEE17AssignTypedLambdaIZNS5_9transform13AlterOpLayoutEvEUlS6_S7_S9_E_EEvT_EUlRKNS0_7TVMArgsEPNS0_11TVMRetValueEE_EEE4CallEPKS1_SG_SK_
  20: tvm::relay::alter_op_layout::AlterOpLayout(tvm::RelayExpr const&)
  19: tvm::relay::ForwardRewrite(tvm::RelayExpr const&, tvm::runtime::TypedPackedFunc<tvm::RelayExpr (tvm::relay::Call const&, tvm::runtime::Array<tvm::RelayExpr, void> const&, tvm::runtime::ObjectRef const&)> const&, std::function<tvm::runtime::ObjectRef (tvm::relay::Call const&)>, std::function<tvm::RelayExpr (tvm::RelayExpr const&)>)
  18: tvm::relay::MixedModeMutator::VisitExpr(tvm::RelayExpr const&)
  17: tvm::relay::MixedModeMutator::VisitLeaf(tvm::RelayExpr const&)
  16: _ZN3tvm5relay16MixedModeMutator17DispatchVisitExprERKNS_9Re
  15: tvm::relay::ExprMutator::VisitExpr(tvm::RelayExpr const&)
  14: tvm::relay::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
  13: _ZZN3tvm5relay11ExprFunctorIFNS_9RelayExprERKS2_EE10InitVTableEvENUlR
  12: tvm::relay::ExprMutator::VisitExpr_(tvm::relay::FunctionNode const*)
  11: tvm::relay::MixedModeMutator::VisitExpr(tvm::RelayExpr const&)
  10: tvm::relay::MixedModeMutator::VisitLeaf(tvm::RelayExpr const&)
  9: _ZN3tvm5relay16MixedModeMutator17DispatchVisitExprERKNS_9Re
  8: tvm::relay::ExprMutator::VisitExpr(tvm::RelayExpr const&)
  7: tvm::relay::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
  6: _ZZN3tvm5relay11ExprFunctorIFNS_9RelayExprERKS2_EE10InitVTableEvENUlR
  5: tvm::relay::MixedModeMutator::VisitExpr_(tvm::relay::CallNode const*)
  4: tvm::relay::ForwardRewriter::Rewrite_(tvm::relay::CallNode const*, tvm::RelayExpr const&)
  3: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::RelayExpr (tvm::relay::Call const&, tvm::runtime::Array<tvm::RelayExpr, void> const&, tvm::runtime::ObjectRef const&)>::AssignTypedLambda<tvm::RelayExpr (*)(tvm::relay::Call const&, tvm::runtime::Array<tvm::RelayExpr, void> const&, tvm::runtime::ObjectRef const&)>(tvm::RelayExpr (*)(tvm::relay::Call const&, tvm::runtime::Array<tvm::RelayExpr, void> const&, tvm::runtime::ObjectRef const&))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  2: tvm::RelayExpr tvm::relay::LayoutRewriter<tvm::relay::alter_op_layout::AlterTransformMemorizer>(tvm::relay::Call const&, tvm::runtime::Array<tvm::RelayExpr, void> const&, tvm::runtime::ObjectRef const&)
  1: tvm::relay::alter_op_layout::AlterTransformMemorizerNode::CallWithNewLayouts(tvm::relay::Call const&, tvm::Attrs, std::vector<tvm::RelayExpr, std::allocator<tvm::RelayExpr> > const&)
  0: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<TVMFuncCreateFromCFunc::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#2}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 81, in cfun
    rv = local_pyfunc(*pyargs)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/op/nn/_nn.py", line 112, in alter_op_layout_dense
    return topi.nn.dense_alter_layout(attrs, inputs, tinfos, out_type)
  File "/home/lizhenxu/anaconda3/lib/python3.8/site-packages/decorator.py", line 231, in fun
    return caller(func, *(extras + args), **kw)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/target/generic_func.py", line 286, in dispatch_func
    return dispatch_dict[k](*args, **kwargs)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/topi/x86/dense_alter_op.py", line 51, in _alter_dense_layout
    _, outs = relay.backend.te_compiler.select_implementation(
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/backend/te_compiler.py", line 201, in select_implementation
    outs = impl.compute(attrs, inputs, out_type)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/op/op.py", line 126, in compute
    return _OpImplementationCompute(self, attrs, inputs, out_type)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
    raise get_last_ffi_error()
  3: TVMFuncCall
  2: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::relay::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#4}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  1: tvm::relay::OpImplementation::Compute(tvm::Attrs const&, tvm::runtime::Array<tvm::te::Tensor, void> const&, tvm::Type const&)
  0: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<TVMFuncCreateFromCFunc::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#2}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 81, in cfun
    rv = local_pyfunc(*pyargs)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/relay/op/strategy/generic.py", line 833, in _compute_dense
    return [topi_compute(*args)]
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/autotvm/task/topi_integration.py", line 164, in wrapper
    cfg = DispatchContext.current.query(tgt, workload)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/autotvm/task/dispatcher.py", line 76, in query
    ret = self._query_inside(target, workload)
  File "/home/lizhenxu/ZJJ/tvm_0.9/tvm/python/tvm/autotvm/task/dispatcher.py", line 421, in _query_inside
    assert wkl == workload
TVMError: AssertionError

Run the official website code tune_relay_x86.py report the error above.

Solution: delete the .log file that was generated by running this code before

[DL Common Issue] RuntimeError: CUDA error 59: Device-side assert triggered

Problem: Run into a CUDA error during training
Solutions:
This error occurs due to the following two reasons:

    Inconsistency between the number of labels/classes and the number of output unitsThe input of the loss function may be incorrect

In my case the error occurs as the loss function is not correctly chosen : change to nn.BCELoss() from nn.CrossEntropyLoss()
Reference: https://towardsdatascience.com/cuda-error-device-side-assert-triggered-c6ae1c8fa4c3

[Solved] AttributeError: ‘_IncompatibleKeys‘ object has no attribute ‘parameters‘

Errors are reported when running the pytorch program. It should be a problem with the torch syntax.

Original code:

model=CNN()#CNN as a self-compiling neural network model
best_model_wts = copy.deepcopy(model.state_dict())
*************************************#(the error codes is below)
model=model.load_state_dict(best_model_wts)

Error message:

Modified code:

model=CNN()#CNN as a self-compiling neural network model
best_model_wts = copy.deepcopy(model.state_dict())
model.load_state_dict(best_model_wts)

[Solved] NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL ,unhandled cuda error, NCCLversion 2.7.8

The method used in this paper

The versions of pytorch, cudatoolkit and CUDA driver should be consistent

Problem description

When training the stylegan3 model with multi GPU:

python train.py --outdir=training-runs --cfg=stylegan3-r \
--data=datastes/your_data.zip \
--cfg=stylegan3-r --gpus=4 --batch=32 --gamma=8 --kimg=1800 --snap=50  --tick=2  

Error Messages:

torch.multiprocessing.spawn.ProcessRaisedException:
……
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1631630841592/work/torch/lib/c10d/ProcessGroupNCCL.cpp:911, unhandled cuda error, NCCL version 2.7.8
ncclUnhandledCudaError: Call to CUDA function failed.

Local Environment
4xTeslaV100 graphics card drivers and CUDA version 11.0

stylegan3 Default Environment

Solution:
Go to the pytorch official website and search the corresponding version of  Cudatookit

conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=11.0 -c pytorch

Tried Method:

Method 1: install nccl (this article is useless)

Method 2: the versions of pytorch, CUDA toolkit and CUDA driver are the same

https://github.com/ultralytics/yolov5/issues/4530

[Solved] torchsummary Error: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.F

Source code:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torchvision.models.resnet18(pretrained=None)
model.fc = nn.Linear(512, 10)
    
summary(model, input_size=[(3, 224, 224)], batch_size=256, device="cuda")

Error Messages: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
Solution:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torchvision.models.resnet18(pretrained=None)
model.fc = nn.Linear(512, 10)

model = model.to(device)  # add this line will be OK

summary(model, input_size=[(3, 224, 224)], batch_size=256, device="cuda")

[Solved] pytorc Error: RuntimeError: one of the variables needed for gradient computation has been modified by an

Error reported by
pytroch when modifying the network:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1600, 16, 256]], which is output 0 of CudnnConvolutionBackward, is at version 1; expected version 0 instead

 

Solution 1:

Check all operations of the tensor that is reported as an error. If there is an addition or subtraction operation of x + = m, x = x + m, all operations shall be changed to the following format:

x = x.clone() + m

[Solved] RuntimeError: cuda runtime error (100) : no CUDA-capable device is detected at

When we upload our code to the server to run, we encounter the following problems:

THCudaCheck FAIL file=/pytorch/aten/src/THC/THCGeneral.cpp line=50 error=100 : no CUDA-capable device is detected
Traceback (most recent call last):
  File "HyperAttentionDTI_main.py", line 185, in <module>
    model = AttentionDTI(hp).cuda()
  File "/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 304, in cuda
    return self._apply(lambda t: t.cuda(device))
  File "/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 201, in _apply
    module._apply(fn)
  File "/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 223, in _apply
    param_applied = fn(param)
  File "/usr/local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 304, in <lambda>
    return self._apply(lambda t: t.cuda(device))
  File "/usr/local/lib/python3.7/site-packages/torch/cuda/__init__.py", line 197, in _lazy_init
    torch._C._cuda_init()
RuntimeError: cuda runtime error (100) : no CUDA-capable device is detected at /pytorch/aten/src/THC/THCGeneral.cpp:50

It’s because our graphics card settings are wrong, because we know whether we have a graphics card or not, otherwise we won’t report the wrong graphics card problem

Solution:

Let’s look at the program code we executed and check the CUDA part. I set my graphics card to 6. We don’t have so many graphics cards to use. I checked the location of my graphics card, which is No. 0. Therefore, we set the first line of the following code to 0!

os.environ["CUDA_VISIBLE_DEVICES"] = "6"
if __name__ == "__main__":
    """select seed"""
    SEED = 1234
    random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    # torch.backends.cudnn.deterministic = True

[Solved] RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the

1. Problems

When I was practicing using pytorch today, I prepared to use GPU, and the following errors occurred:

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same

 

2. Code (adjusted and can run correctly)

import torch.optim
import torchvision.datasets

# Preparing the data set
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from time import time

print(torch.cuda.is_available())

train_data = torchvision.datasets.CIFAR10(root="./dataset",
                                          train=True,
                                          transform=torchvision.transforms.ToTensor(),
                                          download=True)

test_data = torchvision.datasets.CIFAR10(root="./dataset",
                                         train=False,
                                         transform=torchvision.transforms.ToTensor(),
                                         download=True)
print("training_set_data_length: %d" % len(train_data))
print("Test set data length: %d" % len(test_data))

# Load data using DataLoader
train_data_loader = DataLoader(train_data, batch_size=64)
test_data_loader = DataLoader(test_data, batch_size=64)


# Build the neural network (in a separate .py file)
class Net(nn.Module):
    def __init__(self) -> None:
        super(Net, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 32, 5, 1, 2),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(1024, 64),
            nn.Linear(64, 10)

        )

    def forward(self, x):
        x = self.model(x)
        return x


# Create a network model
net = Net()
# Only the model, data, and loss function can run on the GPU
# if torch.cuda.is_available():
#     net = net.cuda()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)


# Loss function
loss_fn = nn.CrossEntropyLoss()
# if torch.cuda.is_available():
#     loss_fn = loss_fn.cuda()
loss_fn.to(device)

# Optimizer
learning_rate = 1e-2 # 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)

# Set the parameters of the training network
# Record the number of training sessions
total_train_step = 0
# Record the number of training sessions
total_test_step = 0
# Number of training rounds
epoch = 10

start_time = time()
writer = SummaryWriter("./logs/train")
for i in range(epoch):
    print("------Round %d of training ------" % (i + 1))

    # Training steps
    for data in train_data_loader:
        imgs, targets = data
        if torch.cuda.is_available():
            imgs, targets = imgs.cuda(), targets.cuda()
        # imgs.to(device)
        # targets.to(device)
        outputs = net(imgs)
        loss = loss_fn(outputs, targets)

        # Optimizer optimization model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_step += 1
        if total_train_step % 100 == 0:
            end_time = time()
            print(end_time - start_time)
            print("training_step: {}, loss: {}".format(total_train_step, loss.item())) # .item() can convert the tensor type to a number
            writer.add_scalar("train_loss", loss.item(), total_train_step)

    # Test Steps
    total_test_loss = 0
    total_accuracy = 0
    with torch.no_grad(): # Zero out the gradient when testing. No need to adjust the gradient for optimization
        for data in test_data_loader:
            imgs, targets = data
            if torch.cuda.is_available():
                imgs, targets = imgs.cuda(), targets.cuda()
            # imgs.to(device)
            # targets.to(device)
            outputs = net(imgs)
            loss = loss_fn(outputs, targets)

            total_test_loss += loss.item()
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy += accuracy
    print("loss on the overall test set: {}".format(total_test_loss))
    print("Percent correct on the overall test set: {}".format(total_accuracy/len(test_data)))

    writer.add_scalar("test_loss", total_test_loss, total_test_step)
    writer.add_scalar("test_accuracy", total_accuracy/len(test_data), total_test_step)
    total_test_step += 1

    # Save the model
    # torch.save(net.state_dict(), "model_{}.pth".format(i))
    # print("Round {} training model saved".format(i))

writer.close()

3. Solutions

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for data in train_data_loader:
   imgs, targets = data
    if torch.cuda.is_available():
        imgs, targets = imgs.cuda(), targets.cuda()
    # imgs.to(device)
    # targets.to(device)
    outputs = net(imgs)

In the above code, IMGs and targets cannot use .to(device), so the input type (torch.Floattensor) will appear after use. If it is not GPU type, it can only be used in another way:

if torch.cuda.is_available():
    imgs, targets = imgs.cuda(), targets.cuda()

This can solve the problem that the input and weight types do not match.

4. Reference

https://stackoverflow.com/questions/59013109/runtimeerror-input-type-torch-floattensor-and-weight-type-torch-cuda-floatte

[Solved] Torch Build Module Error: NotImplementedError

It’s probably such an error reporting method. I’ve been using torch for so many years. I first encountered this error NotImplementedError
I’m not using a nightly version

Traceback (most recent call last):

  File "xxxxx\x.py", line 268, in <module>
    print(x(y).shape)

  File "xxxxx\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)

  File "xxxxx\x.py", line 259, in forward
    x = self.features(x)

  File "xxxxx\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)

  File "xxxxx\lib\site-packages\torch\nn\modules\container.py", line 119, in forward
    input = module(input)

  File "xxxxx\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)

  File "xxxxx\lib\site-packages\torch\nn\modules\module.py", line 201, in _forward_unimplemented
    raise NotImplementedError

NotImplementedError

Call self.forward in _call_impl

result = self.forward(*input, **kwargs)

If you inherit nn.Module, and if you don’t implement self.forward, it will

raise NotImplementedError

It turns out that when I use this function, I really don’t have the forward method:

class Hswish(nn.Module):

    def __init__(self, inplace=True):
        super(Hswish, self).__init__()
        self.inplace = inplace

    def __swish(self, x, beta, inplace=True):
        # But this swish is not used by H-swish
        # The reason it's called H-swish is to make the sigmoid hard
        # approximated by Relu6(x+3)/6
        # Reduced computational effort for embedded deployment
        return x * F.sigmoid(beta * x, inplace)

    @staticmethod
    def Hsigmoid(x, inplace=True):
        return F.relu6(x + 3, inplace=inplace)/6

    def foward(self, x):
        return x * self.Hsigmoid(x, self.inplace)

forward Write as foward

torch.max Example (How to Use)

torch.max(input, dim)

pred = torch.max(input, dim)

Returns the maximum value per row (dim = 1) or column (dim = 0).

_, pred = torch.max(input, dim)

Only the position of the maximum value in each row (dim = 1) or column (dim = 0) is returned.

Example:

import torch

# Construct a 5x3 randomly initialized matrix
x = torch.rand(5, 3)
print('input: ', x)
print('-'*10)
y1 = torch.max(x, 1)
print('max by row: ', y1)
print('-'*10)
y2 = torch.max(x, 0)
print('max by col: ', y2)
print('-'*10)
_, y3 = torch.max(x, 1)
print('max index by row: ', y3)
print('-'*10)
_, y4 = torch.max(x, 0)
print('max index by col: ', y4)

Output result:

input:  tensor([[0.5504, 0.3160, 0.2448],
        [0.8694, 0.3295, 0.2085],
        [0.5530, 0.9984, 0.3531],
        [0.2874, 0.1025, 0.9419],
        [0.0867, 0.4234, 0.8334]])
----------
max by row:  torch.return_types.max(
values=tensor([0.5504, 0.8694, 0.9984, 0.9419, 0.8334]),
indices=tensor([0, 0, 1, 2, 2]))
----------
max by col:  torch.return_types.max(
values=tensor([0.8694, 0.9984, 0.9419]),
indices=tensor([1, 2, 3]))
----------
max index by row:  tensor([0, 0, 1, 2, 2])
----------
max index by col:  tensor([1, 2, 3])