The overall setup: the training process uses TensorFlow 2.7.0 (an autoencoder written in Python) to produce .h5 models, which are converted to .onnx model files and then to .engine files for the Jetson platform (Jetson AGX Xavier, CUDA).
Jetson AGX Xavier specifications:
CUDA: 11.4.315
cuDNN: 8.6.0
TensorRT: 8.5.2.2
JetPack: 5.1.3
python3 -c "import tensorflow as tf; print('TensorFlow version:', tf.__version__)"
TensorFlow version: 2.11.0
Autoencoder training script in Python (example):
Code: Select all
input_img = tf.keras.layers.Input(shape=(2000, lines))
# Encoder
x = tf.keras.layers.Conv1D(12, 128, padding='same')(input_img)
x = tf.keras.layers.MaxPooling1D(4)(x) # Downsample: 2000 -> 500
x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x) # Downsample: 500 -> 250
x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
x = tf.keras.layers.MaxPooling1D(2)(x) # Downsample: 250 -> 125
# Bottleneck
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(self.__config['MODEL']['ENCODED_STATE_SIZE'])(x)
# Decoder
x = tf.keras.layers.Dense(125 * 12)(x) # Expand to match last encoder feature size
x = tf.keras.layers.Reshape((125, 12))(x)
x = tf.keras.layers.UpSampling1D(2)(x) # Upsample: 125 -> 250
x = tf.keras.layers.Conv1D(12, 16, padding='same')(x)
x = tf.keras.layers.UpSampling1D(2)(x) # Upsample: 250 -> 500
x = tf.keras.layers.Conv1D(12, 64, padding='same')(x)
x = tf.keras.layers.UpSampling1D(4)(x) # Upsample: 500 -> 2000
x = tf.keras.layers.Conv1D(lines, 128, padding='same')(x) # Final layer: reconstructs the (2000, lines) input
# Model definition
self.__model = tf.keras.models.Model(input_img, x)
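For completeness, here is a minimal sketch of how this model would be compiled, trained and saved to the .h5 file that is converted later; the optimizer, loss, training settings, variable names and file name are assumptions and not taken from the real training code:
Code: Select all
# Sketch only (assumed settings), continuing the snippet above
self.__model.compile(optimizer='adam', loss='mse')   # assumption: Adam + MSE
self.__model.fit(train_spectra, train_spectra,       # autoencoder: input is also the target
                 epochs=100, batch_size=32,          # hypothetical training settings
                 validation_split=0.1)               # train_spectra: hypothetical (N, 2000, lines) float32 array
self.__model.save('model.h5')                        # hypothetical path, later passed to --h5_model_path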
Below are two comparison plots of the inference values.

Please don't assume that the data might be corrupted: I have collected enough data to train for both cases, and I have verified its validity.
The confusing part is that inference in Python with TensorFlow 2.7.0 on the GPU of an Ubuntu Focal x86_64 machine works correctly ... I mean, I see different values between the two plots; a sketch of that Python-side check is shown below.
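For reference, the x86_64 Python check is essentially the following sketch; the file name is a placeholder and the input preparation mirrors the C++ code further down (FFT magnitudes divided by 4000, DC bin zeroed):
Code: Select all
# Sketch of the Python/TensorFlow reference inference (assumed path and dummy input)
import numpy as np
import tensorflow as tf

model = tf.keras.models.load_model('model.h5')              # hypothetical .h5 path
frame = np.random.rand(2000, 3).astype(np.float32) / 4000   # placeholder for the real FFT magnitudes
frame[0, :] = 0.0                                            # zero the DC bin, as in the C++ code
prediction = model.predict(frame[np.newaxis, ...])           # input shape (1, 2000, 3)
print(prediction.shape)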
On the Jetson, I created a Python script to convert the .h5 model file to .onnx and then to the .engine format:
Code: Select all
import tf2onnx
import tensorflow as tf
import argparse
import subprocess

def convert_h5_to_onnx(h5_model_path, onnx_model_path):
    print("Converting .h5 model to ONNX...")
    model = tf.keras.models.load_model(h5_model_path)
    model_proto, _ = tf2onnx.convert.from_keras(model, opset=13)
    with open(onnx_model_path, "wb") as f:
        f.write(model_proto.SerializeToString())
    print(f"ONNX model saved at {onnx_model_path}")

def convert_onnx_to_trt(onnx_model_path, engine_model_path, trt_precision_mode):
    print("Converting ONNX model to TensorRT Engine...")
    fp_precision_flag = '--fp16' if trt_precision_mode.upper() == 'FP16' else ''
    trtexec_path = "/usr/src/tensorrt/bin/trtexec"
    command = f"{trtexec_path} --onnx={onnx_model_path} --saveEngine={engine_model_path} {fp_precision_flag}"
    process = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if process.returncode != 0:
        print(f"Error in converting to TensorRT engine:\n{process.stderr.decode('utf-8')}")
    else:
        print(f"TensorRT engine saved at {engine_model_path}")

# Main
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a .h5 model to ONNX and TensorRT engine format")
    parser.add_argument("--h5_model_path", type=str, required=True, help="Path to the .h5 model file")
    parser.add_argument("--onnx_model_path", type=str, required=True, help="Path to save the converted ONNX model")
    parser.add_argument("--engine_model_path", type=str, required=True, help="Path to save the converted TensorRT engine")
    parser.add_argument("--trt_precision_mode", type=str, choices=['FP32', 'FP16'], default="FP16", help="Precision mode for TensorRT engine (FP32 or FP16)")
    args = parser.parse_args()

    convert_h5_to_onnx(args.h5_model_path, args.onnx_model_path)
    convert_onnx_to_trt(args.onnx_model_path, args.engine_model_path, args.trt_precision_mode)
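A hypothetical invocation (the script name and all paths are placeholders): python3 convert_model.py --h5_model_path model.h5 --onnx_model_path model.onnx --engine_model_path model.engine --trt_precision_mode FP16
The resulting .engine file is then used from C++ on the Jetson; the inference routine looks like this: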
Code: Select all
void RunInference(ICudaEngine* engine, IExecutionContext* context, int input_index, int output_index, kiss_fft_cpx* x_fft, kiss_fft_cpx* y_fft, kiss_fft_cpx* z_fft, float* predicted_output, int g_code, const char* clientName) {
    int batchSize = 1;
    int input_size = batchSize * 2000 * 3 * sizeof(float); // [1, 2000, 3]
    int output_size = batchSize * 3 * sizeof(float);       // [1, 3]

    // Prepare normalized input data: interleaved X/Y/Z FFT magnitudes scaled by 1/MN
    float input_data[2000 * 3];
    const int MN = 4000;
    for (int i = 0; i < 2000; i++) {
        input_data[i * 3 + 0] = sqrt(x_fft[i].r * x_fft[i].r + x_fft[i].i * x_fft[i].i) / MN;
        input_data[i * 3 + 1] = sqrt(y_fft[i].r * y_fft[i].r + y_fft[i].i * y_fft[i].i) / MN;
        input_data[i * 3 + 2] = sqrt(z_fft[i].r * z_fft[i].r + z_fft[i].i * z_fft[i].i) / MN;
    }

    // Set DC component to zero
    input_data[0] = 0; // X-axis
    input_data[1] = 0; // Y-axis
    input_data[2] = 0; // Z-axis

    // Allocate GPU buffers for input and output
    void* buffers[2];
    write_log(LOG_DEBUG, "RunInference for '%s' - input_index = %d, output_index = %d", clientName, input_index, output_index);
    if (cudaMalloc(&buffers[input_index], input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for input buffer", clientName);
        return;
    }
    if (cudaMalloc(&buffers[output_index], output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to allocate memory for output buffer", clientName);
        cudaFree(buffers[input_index]);
        return;
    }
    if (cudaMemset(buffers[input_index], 0, input_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset input buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }
    if (cudaMemset(buffers[output_index], 0, output_size) != cudaSuccess) {
        write_log(LOG_ERROR, "RunInference for '%s' - Failed to memset output buffer to zero", clientName);
        cudaFree(buffers[input_index]);
        cudaFree(buffers[output_index]);
        return;
    }

    // Copy the input data to the GPU
    cudaMemcpy(buffers[input_index], input_data, input_size, cudaMemcpyHostToDevice);

    // Launch inference
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers, stream, nullptr);
    cudaStreamSynchronize(stream);

    // Copy the output data from GPU to CPU
    cudaMemcpy(predicted_output, buffers[output_index], output_size, cudaMemcpyDeviceToHost);

    // Free GPU memory
    cudaFree(buffers[input_index]);
    cudaFree(buffers[output_index]);
    cudaStreamDestroy(stream);
}
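RunInference is called from the per-client handler thread, which creates the TensorRT runtime, deserializes the engine and looks up the input/output binding indices: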
Code: Select all
IRuntime* runtime = createInferRuntime(gLogger);
if (!runtime) {
    write_log(LOG_ERROR, "client_handler: Failed to create runtime for client %s", client.ClientName);
    return (void*)-1;
}
std::vector<char> engine_data = loadEngine(client.ModelPath, client.ClientName);
ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size(), nullptr);
if (!engine) {
    write_log(LOG_ERROR, "client_handler: Failed to create engine for thread %s", client.ClientName);
    return (void*)-1;
}
IExecutionContext* context = engine->createExecutionContext();
if (!context) {
    write_log(LOG_ERROR, "client_handler: Failed to create execution context for thread %s", client.ClientName);
    engine->destroy();
    return (void*)-1;
}

int input_index = engine->getBindingIndex(client.ModelInputBindingName);   // read from config file
int output_index = engine->getBindingIndex(client.ModelOutputBindingName); // read from config file

RunInference(engine, context, input_index, output_index, x_fft, y_fft, z_fft, predicted_output, client.G_code, client.ClientName);

// Synchronize the GPU to ensure all operations are completed
cudaDeviceSynchronize();

// Check for CUDA errors after synchronization
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
    write_log(LOG_ERROR, "CUDA error after synchronization in thread '%s': %s", client.ClientName, cudaGetErrorString(err));
} else {
    write_log(LOG_INFO, "GPU synchronized successfully for thread '%s'", client.ClientName);
}

context->destroy();
engine->destroy();
runtime->destroy();
Do you have any suggestions?