TensorRT (C/C++) weist auf seltsames Verhalten auf Jetson AGX Xavier hin

Guest · Post by **Guest** » 15 Jan 2025, 17:15

Ich habe zwei unterschiedliche Modelle für zwei Anwendungsfälle entwickelt, um einige Vibrationsmuster zu analysieren: eines beim Einschalten des Systems und das zweite beim Herunterfahren des Systems (also werden keine Vibrationen erkannt)
Der gesamte Trainingsprozess verwendet TensorFlow 2.7.0 (einen Autoencoder in Python), um .h5-Modelle zu erstellen, die in .onnx-Modelldateien und dann in .engine-Dateien für die Jetson-Plattform (Jetson AGX Xavier, CUDA) konvertiert werden.
Jetson AGX Xavier-Spezifikationen:
cuda: 11.4.315
cuDNN: 8.6.0
tensorRT: 8.5.2.2
jetpack: 5.1.3
python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
TensorFlow version: 2.11.0
Auto-Encoder-Trainingsskript in Python (Beispiel):

Code: Select all

# Builds a 1D convolutional autoencoder graph: the input is a (2000, lines)
# sequence and the last Conv1D emits `lines` channels over 2000 steps again.
# `lines` and `self` are defined outside this snippet.
# NOTE(review): no `activation=` is passed to any Conv1D/Dense layer below; with
# the Keras default (activation=None) these layers are linear — confirm intended.
input_img = tf.keras.layers.Input(shape=(2000, lines))

# Encoder
# Each stage: Conv1D with 12 filters (kernel sizes 128, 64, 16) using
# padding='same' so the sequence length is unchanged, then max pooling.
x = tf.keras.layers.Conv1D(12, 128, padding='same')(input_img)
x = tf.keras.layers.MaxPooling1D(4)(x)  # Downsample: 2000 -> 500

x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x)  # Downsample: 500 -> 250

x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x)  # Downsample: 250 -> 125

# Bottleneck
# Flatten the (125, 12) feature map and project it to the encoded state whose
# size is read from self.__config['MODEL']['ENCODED_STATE_SIZE'].
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(self.__config['MODEL']['ENCODED_STATE_SIZE'])(x)

# Decoder
# Mirrors the encoder: expand back to 125 * 12 values, reshape, then upsample
# and convolve with the encoder's kernel sizes in reverse order (16, 64, 128).
x = tf.keras.layers.Dense(125 * 12)(x)  # Expand to match last encoder feature size
x = tf.keras.layers.Reshape((125, 12))(x)

x = tf.keras.layers.UpSampling1D(2)(x)  # Upsample: 125 -> 250
x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)

x = tf.keras.layers.UpSampling1D(2)(x)  # Upsample: 250 -> 500
x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)

x = tf.keras.layers.UpSampling1D(4)(x)  # Upsample: 500 -> 2000
# The final layer uses `lines` filters, so the output is (2000, lines) like the input.
x = tf.keras.layers.Conv1D(lines, 128, padding='same')(x)  # Correct Final Layer

# Model definition
# Maps input_img to the reconstructed tensor x and stores the model on the instance.
self.__model = tf.keras.models.Model(input_img, x)

Es spielt keine Rolle, welches Modell ich verwende, die Inferenzergebniswerte sind die SELBEN, genau dieselben Werte, als ob das neuronale Netzwerk nichts gelernt hätte......
Unten sehen Sie zwei Vergleichsdiagramme mit den Inferenzwerten

Gehen Sie bitte nicht davon aus, dass die Daten möglicherweise beschädigt sind: Ich habe genügend Daten gesammelt, um für beide Fälle zu trainieren, und ich habe ihre Gültigkeit überprüft.
Das Verwirrende daran ist, dass die Inferenz in Python mit TensorFlow 2.7.0 mit GPU auf einem Ubuntu Focal x86_64 funktioniert ... Ich meine, ich habe unterschiedliche Werte zwischen den beiden Diagrammen gesehen.
Auf dem Jetson habe ich ein Python-Skript erstellt, um eine .h5-Modelldatei in .onnx und dann in das .engine-Format zu konvertieren:

Code: Select all

import tf2onnx
import tensorflow as tf
import argparse
import subprocess

def convert_h5_to_onnx(h5_model_path, onnx_model_path):
    """Load a Keras .h5 model and write it to disk as an ONNX file.

    Args:
        h5_model_path: Path of the .h5 model passed to
            ``tf.keras.models.load_model``.
        onnx_model_path: Destination path; the file is opened in binary
            write mode and receives the serialized ONNX model.
    """
    print("Converting .h5 model to ONNX...")

    keras_model = tf.keras.models.load_model(h5_model_path)

    # Only the first element of the returned pair is used; opset 13 is requested.
    conversion = tf2onnx.convert.from_keras(keras_model, opset=13)
    onnx_proto, _ignored = conversion

    with open(onnx_model_path, "wb") as onnx_file:
        onnx_file.write(onnx_proto.SerializeToString())

    print(f"ONNX model saved at {onnx_model_path}")

def convert_onnx_to_trt(onnx_model_path, engine_model_path, trt_precision_mode,
                        trtexec_path="/usr/src/tensorrt/bin/trtexec"):
    """Build a TensorRT engine from an ONNX model by invoking ``trtexec``.

    The command is passed to ``subprocess.run`` as an argument list (no shell),
    so paths containing spaces or shell metacharacters are handled literally.

    Args:
        onnx_model_path: Path of the ONNX model to convert.
        engine_model_path: Path where ``trtexec`` should save the engine.
        trt_precision_mode: ``'FP16'`` (case-insensitive) adds ``--fp16``;
            any other value adds no precision flag.
        trtexec_path: Location of the ``trtexec`` executable.

    Returns:
        None. Success or failure is reported on stdout.
    """
    print("Converting ONNX model to TensorRT Engine...")

    command = [
        str(trtexec_path),
        f"--onnx={onnx_model_path}",
        f"--saveEngine={engine_model_path}",
    ]
    # Only append the flag when requested; an empty argument must not be passed.
    if trt_precision_mode.upper() == 'FP16':
        command.append('--fp16')

    try:
        process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as exc:
        # Without a shell, a missing or non-executable trtexec raises instead of
        # producing a non-zero exit status; report it the same way.
        print(f"Error in converting to TensorRT engine:\n{exc}")
        return

    if process.returncode != 0:
        # errors='replace' keeps the report usable if stderr is not valid UTF-8.
        print(f"Error in converting to TensorRT engine:\n{process.stderr.decode('utf-8', errors='replace')}")
    else:
        print(f"TensorRT engine saved at {engine_model_path}")

# Main
if __name__ == "__main__":
    # Command-line entry point: parse the three paths and the precision mode,
    # then run the .h5 -> ONNX -> TensorRT engine conversion in sequence.
    cli = argparse.ArgumentParser(description="Convert a .h5 model to ONNX and TensorRT engine format")

    required_paths = (
        ("--h5_model_path", "Path to the .h5 model file"),
        ("--onnx_model_path", "Path to save the converted ONNX model"),
        ("--engine_model_path", "Path to save the converted TensorRT engine"),
    )
    for flag, description in required_paths:
        cli.add_argument(flag, type=str, required=True, help=description)

    cli.add_argument(
        "--trt_precision_mode",
        type=str,
        choices=['FP32', 'FP16'],
        default="FP16",
        help="Precision mode for TensorRT engine (FP32 or FP16)",
    )

    options = cli.parse_args()

    convert_h5_to_onnx(options.h5_model_path, options.onnx_model_path)
    convert_onnx_to_trt(options.onnx_model_path, options.engine_model_path, options.trt_precision_mode)

„RunInference“ ist meine C/C++-Inferenzfunktion mit TensorRT (als Eingabedaten habe ich FFTs der Rohwerte verwendet)

Code: Select all

// Runs one TensorRT inference on the FFT magnitudes of three axes.
//
// The input tensor is [1, 2000, 3] floats: for each of the 2000 bins, the
// magnitude of x_fft/y_fft/z_fft divided by 4000, with bin 0 (DC) set to zero.
// The first 3 floats of the output binding are copied into predicted_output,
// exactly as before, so the caller's buffer contract is unchanged.
//
// Compared with the previous version this function:
//   * sizes the device output buffer from the binding's real dimensions, so
//     an engine whose output is larger than 3 floats no longer writes past
//     the allocation;
//   * sets the input dimensions when the engine has dynamic input axes;
//   * checks enqueueV2 and every CUDA call, so a failed launch is reported
//     instead of silently returning the zero-initialised buffer;
//   * validates the binding indices and releases all resources on every path.
//
// Parameters:
//   engine, context   TensorRT engine and its execution context.
//   input_index,
//   output_index      Binding indices; must be 0 and 1 in either order.
//   x_fft,y_fft,z_fft FFT results, at least 2000 entries each.
//   predicted_output  Receives 3 floats on success; untouched on failure.
//   g_code            Unused here.
//   clientName        Used only in log messages.
void RunInference(ICudaEngine* engine, IExecutionContext* context, int input_index, int output_index, kiss_fft_cpx* x_fft, kiss_fft_cpx* y_fft, kiss_fft_cpx* z_fft, float* predicted_output, int g_code, const char* clientName) {
    (void)g_code;  // not used by this function; kept so the signature is unchanged

    const int batchSize = 1;
    const int kBins = 2000;
    const int kAxes = 3;
    const size_t input_size = static_cast<size_t>(batchSize) * kBins * kAxes * sizeof(float);  // [1, 2000, 3]
    const size_t output_size = static_cast<size_t>(batchSize) * 3 * sizeof(float);              // [1, 3] handed back to the caller

    if (!engine || !context || !x_fft || !y_fft || !z_fft || !predicted_output) {
        write_log(LOG_ERROR, "RunInference for '%s' - Invalid (null) argument", clientName);
        return;
    }

    write_log(LOG_DEBUG, "RunInference for '%s' - input_index = %d, output_index = %d", clientName, input_index, output_index);

    // buffers[] below has exactly two slots, so the engine must expose exactly
    // two bindings and both indices must address distinct slots.
    // getBindingIndex() reports an unknown tensor name as -1.
    if (engine->getNbBindings() != 2 ||
        input_index < 0 || input_index > 1 ||
        output_index < 0 || output_index > 1 ||
        input_index == output_index) {
        write_log(LOG_ERROR, "RunInference for '%s' - Invalid bindings (count = %d, input_index = %d, output_index = %d)",
                  clientName, engine->getNbBindings(), input_index, output_index);
        return;
    }

    // Engines with dynamic input axes (e.g. an unspecified batch size in the
    // ONNX model) refuse to run until concrete input dimensions are set.
    if (!context->allInputDimensionsSpecified()) {
        auto requested = engine->getBindingDimensions(input_index);
        if (requested.nbDims != 3) {
            write_log(LOG_ERROR, "RunInference for '%s' - Input binding has %d dimensions, expected 3", clientName, requested.nbDims);
            return;
        }
        requested.d[0] = batchSize;
        requested.d[1] = kBins;
        requested.d[2] = kAxes;
        if (!context->setBindingDimensions(input_index, requested)) {
            write_log(LOG_ERROR, "RunInference for '%s' - Failed to set input dimensions", clientName);
            return;
        }
    }

    // The engine reads the whole input binding, so it must match what we upload.
    const auto in_dims = context->getBindingDimensions(input_index);
    long long in_elems = 1;
    for (int d = 0; d < in_dims.nbDims; ++d) {
        if (in_dims.d[d] < 0) {
            in_elems = -1;
            break;
        }
        in_elems *= in_dims.d[d];
    }
    if (in_elems != static_cast<long long>(batchSize) * kBins * kAxes) {
        write_log(LOG_ERROR, "RunInference for '%s' - Input binding does not hold %d x %d x %d values", clientName, batchSize, kBins, kAxes);
        return;
    }

    // The engine writes the whole output binding, so the device buffer must be
    // at least that large regardless of how much is copied back.
    const auto out_dims = context->getBindingDimensions(output_index);
    long long out_elems = 1;
    for (int d = 0; d < out_dims.nbDims; ++d) {
        if (out_dims.d[d] < 0) {
            write_log(LOG_ERROR, "RunInference for '%s' - Output dimensions are unresolved", clientName);
            return;
        }
        out_elems *= out_dims.d[d];
    }
    // Sized as 4-byte elements; this over-allocates harmlessly for narrower
    // types. NOTE(review): the copy-back below assumes a float32 output binding.
    const size_t engine_output_size = static_cast<size_t>(out_elems) * sizeof(float);
    const size_t device_output_size = engine_output_size > output_size ? engine_output_size : output_size;
    if (engine_output_size != output_size) {
        write_log(LOG_ERROR, "RunInference for '%s' - Output binding holds %d values but only the first %d are returned to the caller",
                  clientName, static_cast<int>(out_elems), static_cast<int>(output_size / sizeof(float)));
    }

    // Prepare normalized input data
    float input_data[kBins * kAxes];
    const int MN = 4000;

    for (int i = 0; i < kBins; i++) {
        input_data[i * kAxes + 0] = sqrt(x_fft[i].r * x_fft[i].r + x_fft[i].i * x_fft[i].i) / MN;
        input_data[i * kAxes + 1] = sqrt(y_fft[i].r * y_fft[i].r + y_fft[i].i * y_fft[i].i) / MN;
        input_data[i * kAxes + 2] = sqrt(z_fft[i].r * z_fft[i].r + z_fft[i].i * z_fft[i].i) / MN;
    }

    // Set DC component to zero
    input_data[0] = 0;  // X-axis
    input_data[1] = 0;  // Y-axis
    input_data[2] = 0;  // Z-axis

    void* buffers[2] = {nullptr, nullptr};
    cudaStream_t stream = nullptr;
    bool stream_created = false;

    // Single cleanup routine used by every exit path below.
    // cudaFree(nullptr) is a no-op, so unallocated slots are safe to pass.
    auto release = [&]() {
        cudaFree(buffers[0]);
        cudaFree(buffers[1]);
        if (stream_created) {
            cudaStreamDestroy(stream);
        }
    };

    // Allocate GPU buffers for input and output
    if (cudaMalloc(&buffers[input_index], input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for input buffer", clientName);
        release();
        return;
    }
    if (cudaMalloc(&buffers[output_index], device_output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for output buffer", clientName);
        release();
        return;
    }

    if (cudaMemset(buffers[input_index], 0, input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset input buffer to zero", clientName);
        release();
        return;
    }
    if (cudaMemset(buffers[output_index], 0, device_output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset output buffer to zero", clientName);
        release();
        return;
    }

    // Copy the input data to the GPU
    cudaError_t err = cudaMemcpy(buffers[input_index], input_data, input_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to copy input to device: %s", clientName, cudaGetErrorString(err));
        release();
        return;
    }

    // Launch inference
    err = cudaStreamCreate(&stream);
    if (err != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to create CUDA stream: %s", clientName, cudaGetErrorString(err));
        release();
        return;
    }
    stream_created = true;

    // enqueueV2 returns false when the launch was rejected (for example when
    // input dimensions are missing); the output buffer would then stay zero.
    if (!context->enqueueV2(buffers, stream, nullptr)) {
        write_log(LOG_ERROR, "RunInference for '%s' - enqueueV2 failed", clientName);
        release();
        return;
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Inference failed: %s", clientName, cudaGetErrorString(err));
        release();
        return;
    }

    // Copy the output data from GPU to CPU
    err = cudaMemcpy(predicted_output, buffers[output_index], output_size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to copy output to host: %s", clientName, cudaGetErrorString(err));
    }

    // Free GPU memory and the stream
    release();
}

So lade ich ein Modell in die App und rufe die Inferenzfunktion auf:

Code: Select all

    // Create the TensorRT runtime, deserialize the client's engine, run one
    // inference and tear everything down again. Every failure path releases
    // the objects created so far before returning (void*)-1.
    IRuntime* runtime = createInferRuntime(gLogger);
    if (!runtime) {
        write_log(LOG_ERROR, "client_handler: Failed to create runtime for client %s", client.ClientName);
        return (void*)-1;
    }

    // `auto` keeps whatever container type loadEngine returns; only data()
    // and size() are used below.
    auto engine_data = loadEngine(client.ModelPath, client.ClientName);

    ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size(), nullptr);
    if (!engine) {
        write_log(LOG_ERROR, "client_handler: Failed to create engine for thread %s", client.ClientName);
        runtime->destroy();
        return (void*)-1;
    }

    IExecutionContext* context = engine->createExecutionContext();
    if (!context) {
        write_log(LOG_ERROR, "client_handler: Failed to create execution context for thread %s", client.ClientName);
        engine->destroy();
        runtime->destroy();
        return (void*)-1;
    }

    int input_index = engine->getBindingIndex(client.ModelInputBindingName);   // name comes from the config file
    int output_index = engine->getBindingIndex(client.ModelOutputBindingName); // name comes from the config file

    // getBindingIndex returns -1 when the configured name does not exist in
    // the engine; such an index must never be used to address a buffer slot.
    if (input_index < 0 || output_index < 0) {
        write_log(LOG_ERROR, "client_handler: Binding name not found in engine for thread %s (input_index = %d, output_index = %d)",
                  client.ClientName, input_index, output_index);
        context->destroy();
        engine->destroy();
        runtime->destroy();
        return (void*)-1;
    }

    RunInference(engine, context, input_index, output_index, x_fft, y_fft, z_fft, predicted_output, client.G_code, client.ClientName);

    // Synchronize the GPU to ensure all operations are completed; keep the
    // synchronization result and fall back to the last recorded error.
    cudaError_t err = cudaDeviceSynchronize();
    if (err == cudaSuccess) {
        err = cudaGetLastError();
    }
    if (err != cudaSuccess) {
        write_log(LOG_ERROR, "CUDA error after synchronization in thread '%s': %s", client.ClientName, cudaGetErrorString(err));
    } else {
        write_log(LOG_INFO, "GPU synchronized successfully for thread '%s'", client.ClientName);
    }

    // Destroy in reverse order of creation.
    context->destroy();
    engine->destroy();
    runtime->destroy();

Ich möchte darauf hinweisen, dass die Vibrationen von der Anwendung erkannt werden, verstehe aber nicht, warum sich der Wertebereich je nach trainiertem Modell aus den beiden Szenarien nicht ändert. Ich vermute, dass das Problem bei der Modellkonvertierung oder dem Inferenzprozess/-funktion in TensorRT unter Verwendung von C/C++ liegt.
Haben Sie Vorschläge?

TensorRT (C/C++) weist auf seltsames Verhalten auf Jetson AGX Xavier hin

TensorRT (C/C++) weist auf seltsames Verhalten auf Jetson AGX Xavier hin ⇐ C++

Quick Reply