Tensor RT #285 (Open)
bushibushi wants to merge 55 commits into CMU-Perceptual-Computing-Lab:master from Deepomatic:TensorRT_PR (base: master)
Commits (55 total; the diff below shows changes from 38 commits)
All commits by bushibushi:

- 564aece Files for tensort rt pose detection, for now nothing done.
- dfc1f82 Adding timer in new demo and checking build before replacing inference.
- a4885e0 PoseExtractorTensorRT changed names for build conflicts but still per…
- c05580d Started modifying tutorial pose 3.
- 9a97e93 More precise timing.
- 4778ed6 More precise timings before replacing inference.
- 9c258b7 Clearer timing display.
- e6fbd25 Replaced poseExtractorCaffe with poseExtractorTensorRT
- f290fc5 Added inference sample code at end of poseExtractorTensorRT to work o…
- ddc2396 First code adaptation trial. Will not compile, still loads to replace.
- f09f27b New netTensorRT version, cleaner, ready for debug, loads of questions.
- ba2b435 Fixed everything to compile, runs, reads network and convert but then…
- 97bbc05 Debug logs.
- c666163 First try on tensorRT inference with caffe Blobs.
- 1c77534 Running, but not pose recognition. Find a way to copy memory correctly.
- 1380b14 pose.sh script
- 32f5387 Timing in original pose demo
- d2310db Did not take into account forwardPass input data !
- 576c055 Data copied to cuda memory. Correct sizes hardcoded, no CUDA error an…
- e5d27fe Tutorial pose 3 working !!!! Gaining x2 inference time, now time for …
- 7d37095 TensorRT Net input and output dimensions at runtime.
- f3a898c NetTensorRT cleaning.
- 5c630b5 NetTensorRT cleaning bis.
- a617583 Cleaning compilation fix.
- d3a31e0 caffeToGIE needs fixed input size and cannot be determined at runtime…
- f6df326 Engine serialization and deserialization.
- 404077a Targetting highest possible FPS in demo.
- 1971baa Asynchronous inference.
- 330d4bb Way simpler inference code, a lot was useless.
- c2be9aa Removing log to speedup inference.
- 89e3b44 ResizeAndMergeBase CPU version.
- b54ae11 Inference model for pose net size 256x192
- 7808f89 Detailed poseExtractor Timings.
- 8023fb1 Faster Resize and Merge.
- b3ae8ec Merge branch 'master' into TensorRT_PR
- ec58a48 TENSORRT precompiler guards
- 33aa099 TENSORRT compilation is still partly using caffe
- 359b601 Missing guards for TensorRT
- d4a89d0 PIMPL version of poseExtractorTensorRT, still having template compila…
- 766c44a Spot the differences part 1.
- c12dd28 Spot the differences part 2
- 047d18b Spot the differences 3
- 9e4d903 Fixed compilation without TensorRT
- b3655d0 Fix attempt
- e76dc71 Wrong variable name
- bbe83e9 Merge branch 'master' into TensorRT_PR
- 6456dff Too much changed in poseExtractorCaffe, need to rewrite TensorRT one …
- b3673e6 PIMPL for netTensorRT
- 273a351 Fix source issues, example remains.
- ca682c4 Fix samples
- cb0d440 Compilation fixed, TensorRT net optimisation works, segfault on infer…
- 827510b Code kind of work, not full pipeline lead to no shape displayed, size…
- a1619fa Useless preproc macros
- 344ab67 NetTensorRT modifs
- b9c33c9 Merge branch 'master' into TensorRT_PR
examples/tutorial_pose/3_extract_from_image_TensorRT.cpp (174 additions, 0 deletions)

@@ -0,0 +1,174 @@
// ------------------------- OpenPose Library Tutorial - Pose - Example 3 - Extract from Image with TensorRT -------------------------
// This example shows the user how to:
//     1. Load an image (`filestream` module)
//     2. Extract the pose of that image (`pose` module)
//     3. Render the pose on a resized copy of the input image (`pose` module)
//     4. Display the rendered pose (`gui` module)
// In addition to the previous OpenPose modules, we also need to use:
//     1. `core` module: for the Array<float> class that the `pose` module needs
//     2. `utilities` module: for the error & logging functions, i.e. op::error & op::log respectively

// 3rdparty dependencies
#include <gflags/gflags.h> // DEFINE_bool, DEFINE_int32, DEFINE_int64, DEFINE_uint64, DEFINE_double, DEFINE_string
#include <glog/logging.h> // google::InitGoogleLogging
// OpenPose dependencies
#include <openpose/core/headers.hpp>
#include <openpose/filestream/headers.hpp>
#include <openpose/gui/headers.hpp>
#include <openpose/pose/headers.hpp>
#include <openpose/utilities/headers.hpp>

// See all the available parameter options with the `--help` flag. E.g. `./build/examples/openpose/openpose.bin --help`.
// Note: This command will show you flags for other unnecessary 3rdparty files. Check only the flags for the OpenPose
// executable. E.g. for `openpose.bin`, look for `Flags from examples/openpose/openpose.cpp:`.
// Debugging
DEFINE_int32(logging_level,     3,      "The logging level. Integer in the range [0, 255]. 0 will output any log() message, while"
                                        " 255 will not output any. Current OpenPose library messages are in the range 0-4: 1 for"
                                        " low priority messages and 4 for important ones.");
// Producer
DEFINE_string(image_path,       "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image.");
// OpenPose
DEFINE_string(model_pose,       "COCO", "Model to be used. E.g. `COCO` (18 keypoints), `MPI` (15 keypoints, ~10% faster), "
                                        "`MPI_4_layers` (15 keypoints, even faster but less accurate).");
DEFINE_string(model_folder,     "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
DEFINE_string(net_resolution,   "128x96", "Multiples of 16. If it is increased, the accuracy potentially increases. If it is decreased,"
                                        " the speed increases. For maximum speed-accuracy balance, it should keep the closest aspect"
                                        " ratio possible to the images or videos to be processed. E.g. the default `128x96` is"
                                        " optimal for 16:9 videos, e.g. full HD (1920x1080) and HD (1280x720) videos.");
DEFINE_string(resolution,       "1280x720", "The image resolution (display and output). Use \"-1x-1\" to force the program to use the"
                                        " default images resolution.");
DEFINE_int32(num_gpu_start,     0,      "GPU device start number.");
DEFINE_double(scale_gap,        0.3,    "Scale gap between scales. No effect unless scale_number > 1. Initial scale is always 1."
                                        " If you want to change the initial scale, you actually want to multiply the"
                                        " `net_resolution` by your desired initial scale.");
DEFINE_int32(scale_number,      1,      "Number of scales to average.");
// OpenPose Rendering
DEFINE_bool(disable_blending,   false,  "If blending is enabled, it will merge the results with the original frame. If disabled, it"
                                        " will only display the results on a black background.");
DEFINE_double(render_threshold, 0.05,   "Only estimated keypoints whose score confidences are higher than this threshold will be"
                                        " rendered. Generally, a high threshold (> 0.5) will only render very clear body parts;"
                                        " while small thresholds (~0.1) will also output guessed and occluded keypoints, but also"
                                        " more false positives (i.e. wrong detections).");
DEFINE_double(alpha_pose,       0.6,    "Blending factor (range 0-1) for the body part rendering. 1 will show it completely, 0 will"
                                        " hide it. Only valid for GPU rendering.");

typedef std::vector<std::pair<std::string, std::chrono::high_resolution_clock::time_point>> OpTimings;

static OpTimings timings;

static void timeNow(const std::string& label)
{
    const auto now = std::chrono::high_resolution_clock::now();
    const auto timing = std::make_pair(label, now);
    timings.push_back(timing);
}

static std::string timeDiffToString(const std::chrono::high_resolution_clock::time_point& t1,
                                    const std::chrono::high_resolution_clock::time_point& t2)
{
    return std::to_string(std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3) + " ms";
}

int openPoseTutorialPose3()
{
#ifdef USE_TENSORRT
    op::log("Starting pose estimation.", op::Priority::High);

    timeNow("Start");

    op::log("OpenPose Library Tutorial - Pose Example 3.", op::Priority::High);
    // ------------------------- INITIALIZATION -------------------------
    // Step 1 - Set logging level
    //     - 0 will output all the logging messages
    //     - 255 will output nothing
    op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.", __LINE__, __FUNCTION__, __FILE__);
    op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level);
    op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
    // Step 2 - Read Google flags (user defined configuration)
    // outputSize
    const auto outputSize = op::flagsToPoint(FLAGS_resolution, "1280x720");
    // netInputSize
    const auto netInputSize = op::flagsToPoint(FLAGS_net_resolution, "128x96");
    // netOutputSize
    const auto netOutputSize = netInputSize;
    // poseModel
    const auto poseModel = op::flagsToPoseModel(FLAGS_model_pose);
    // Check no contradictory flags enabled
    if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.)
        op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
    if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
        op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
    // Logging
    op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
    // Step 3 - Initialize all required classes
    op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap};
    op::CvMatToOpOutput cvMatToOpOutput{outputSize};
    op::PoseExtractorTensorRT poseExtractorTensorRT{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel,
                                                    FLAGS_model_folder, FLAGS_num_gpu_start};
    op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_render_threshold,
                                  !FLAGS_disable_blending, (float)FLAGS_alpha_pose};
    op::OpOutputToCvMat opOutputToCvMat{outputSize};
    const op::Point<int> windowedSize = outputSize;
    op::FrameDisplayer frameDisplayer{windowedSize, "OpenPose Tutorial - Example 3"};
    // Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here)
    poseExtractorTensorRT.initializationOnThread();
    poseRenderer.initializationOnThread();

    timeNow("Initialization");

    // ------------------------- POSE ESTIMATION AND RENDERING -------------------------
    // Step 1 - Read and load image, error if empty (possibly wrong path)
    cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
    if (inputImage.empty())
        op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
    timeNow("Step 1");
    // Step 2 - Format input image to OpenPose input and output formats
    op::Array<float> netInputArray;
    std::vector<float> scaleRatios;
    std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
    double scaleInputToOutput;
    op::Array<float> outputArray;
    std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
    timeNow("Step 2");
    // Step 3 - Estimate poseKeypoints
    poseExtractorTensorRT.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
    const auto poseKeypoints = poseExtractorTensorRT.getPoseKeypoints();
    timeNow("Step 3");
    // Step 4 - Render poseKeypoints
    poseRenderer.renderPose(outputArray, poseKeypoints);
    timeNow("Step 4");
    // Step 5 - OpenPose output format to cv::Mat
    auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
    timeNow("Step 5");

    // ------------------------- SHOWING RESULT AND CLOSING -------------------------
    // Step 1 - Show results
    frameDisplayer.displayFrame(outputImage, 0); // Alternative: cv::imshow(outputImage) + cv::waitKey(0)
    // Step 2 - Logging information message
    op::log("Example 3 successfully finished.", op::Priority::High);

    // timeDiffToString already appends the "ms" unit
    const auto totalTime = timeDiffToString(timings.back().second, timings.front().second);
    const auto message = "Pose estimation successfully finished. Total time: " + totalTime + ".";
    op::log(message, op::Priority::High);

    for (OpTimings::iterator timing = timings.begin() + 1; timing != timings.end(); ++timing)
    {
        const auto log_time = (*timing).first + " - " + timeDiffToString((*timing).second, (*(timing - 1)).second);
        op::log(log_time, op::Priority::High);
    }

#endif // USE_TENSORRT

    // Return successful message
    return 0;
}

int main(int argc, char *argv[])
{
    // Initializing google logging (Caffe uses it for logging)
    google::InitGoogleLogging("openPoseTutorialPose3");

    // Parsing command line flags
    gflags::ParseCommandLineFlags(&argc, &argv, true);

    // Running openPoseTutorialPose3
    return openPoseTutorialPose3();
}
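The example above threads timeNow() calls through the pipeline and prints per-step deltas at the end. That pattern can be sketched in isolation; this is a self-contained version with no OpenPose dependency (names mirror the tutorial helpers, but the totalElapsedMs helper is an addition for illustration):

```cpp
#include <cassert>
#include <chrono>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using Clock = std::chrono::high_resolution_clock;
using OpTimings = std::vector<std::pair<std::string, Clock::time_point>>;

static OpTimings timings;

// Record a labelled timestamp, as the tutorial does around each pipeline step.
static void timeNow(const std::string& label)
{
    timings.emplace_back(label, Clock::now());
}

// Difference t1 - t2 in milliseconds, formatted like the tutorial's helper.
static std::string timeDiffToString(const Clock::time_point& t1, const Clock::time_point& t2)
{
    const auto ms = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t2).count() * 1e3;
    return std::to_string(ms) + " ms";
}

// Total elapsed time between the first and last recorded label, in milliseconds.
static double totalElapsedMs()
{
    return std::chrono::duration_cast<std::chrono::duration<double>>(
        timings.back().second - timings.front().second).count() * 1e3;
}
```

Printing the label of each entry together with the delta to the previous entry, as the tutorial's final loop does, gives a cheap per-step profile without any external profiler.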
New file (60 additions):

@@ -0,0 +1,60 @@
#ifdef USE_TENSORRT
#ifndef OPENPOSE_CORE_NET_TENSORRT_HPP
#define OPENPOSE_CORE_NET_TENSORRT_HPP

#include <caffe/net.hpp>
#include <openpose/core/common.hpp>
#include <openpose/core/net.hpp>

#include "NvInfer.h"

namespace op
{
    class OP_API NetTensorRT : public Net
    {
    public:
        NetTensorRT(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto,
                    const std::string& caffeTrainedModel, const int gpuId = 0,
                    const std::string& lastBlobName = "net_output");

        virtual ~NetTensorRT();

        void initializationOnThread();

        // Alternative a) getInputDataCpuPtr or getInputDataGpuPtr + forwardPass
        float* getInputDataCpuPtr() const;

        float* getInputDataGpuPtr() const;

        // Alternative b)
        void forwardPass(const float* const inputNetData = nullptr) const;

        boost::shared_ptr<caffe::Blob<float>> getOutputBlob() const;

    private:
        // Init with constructor
        const int mGpuId;
        const std::array<int, 4> mNetInputSize4D;
        std::array<int, 4> mNetOutputSize4D;
        const unsigned long mNetInputMemory;
        const std::string mCaffeProto;
        const std::string mCaffeTrainedModel;
        const std::string mLastBlobName;
        // Init with thread
        boost::shared_ptr<caffe::Blob<float>> spInputBlob;
        boost::shared_ptr<caffe::Blob<float>> spOutputBlob;

        // TensorRT stuff
        nvinfer1::ICudaEngine* cudaEngine;
        nvinfer1::IExecutionContext* cudaContext;
        nvinfer1::ICudaEngine* caffeToGIEModel();
        nvinfer1::ICudaEngine* createEngine();
        cudaStream_t stream;
        cudaEvent_t start, end;

        DELETE_COPY(NetTensorRT);
    };
}

#endif // OPENPOSE_CORE_NET_TENSORRT_HPP
#endif // USE_TENSORRT
New file (52 additions):

@@ -0,0 +1,52 @@
#ifdef USE_TENSORRT
#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP
#define OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP

#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
#include <openpose/core/net.hpp>
#include <openpose/core/nmsCaffe.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/pose/bodyPartConnectorCaffe.hpp>
#include <openpose/pose/enumClasses.hpp>
#include <openpose/pose/poseExtractor.hpp>

namespace op
{
    class OP_API PoseExtractorTensorRT : public PoseExtractor
    {
    public:
        PoseExtractorTensorRT(const Point<int>& netInputSize, const Point<int>& netOutputSize,
                              const Point<int>& outputSize, const int scaleNumber,
                              const PoseModel poseModel, const std::string& modelFolder, const int gpuId,
                              const std::vector<HeatMapType>& heatMapTypes = {},
                              const ScaleMode heatMapScale = ScaleMode::ZeroToOne);

        virtual ~PoseExtractorTensorRT();

        void netInitializationOnThread();

        void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize,
                         const std::vector<float>& scaleRatios = {1.f});

        const float* getHeatMapCpuConstPtr() const;

        const float* getHeatMapGpuConstPtr() const;

        const float* getPoseGpuConstPtr() const;

    private:
        const float mResizeScale;
        std::shared_ptr<Net> spNet;
        std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeTensorRT;
        std::shared_ptr<NmsCaffe<float>> spNmsTensorRT;
        std::shared_ptr<BodyPartConnectorCaffe<float>> spBodyPartConnectorTensorRT;
        // Init with thread
        boost::shared_ptr<caffe::Blob<float>> spTensorRTNetOutputBlob;
        std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
        std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
        std::shared_ptr<caffe::Blob<float>> spPoseBlob;

        DELETE_COPY(PoseExtractorTensorRT);
    };
}

#endif // OPENPOSE_POSE_POSE_EXTRACTOR_TENSORRT_HPP
#endif // USE_TENSORRT

Review comment on the constructor (truncated in this capture): "Is the definition and implementation for …"
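The net_resolution flag documented earlier must be "Multiples of 16", and the commit "caffeToGIE needs fixed input size and cannot be determined at runtime…" notes that the TensorRT engine is additionally built for one fixed input size. A hedged sketch of parsing and validating such a "WIDTHxHEIGHT" flag; the helper names are illustrative, not functions from the PR:

```cpp
#include <cassert>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <utility>

// Parse a "WIDTHxHEIGHT" string such as "128x96" into a (width, height) pair.
static std::pair<int, int> parseResolution(const std::string& flag)
{
    int width = -1;
    int height = -1;
    if (std::sscanf(flag.c_str(), "%dx%d", &width, &height) != 2)
        throw std::invalid_argument("Expected WIDTHxHEIGHT, got: " + flag);
    return {width, height};
}

// OpenPose requires net_resolution dimensions to be multiples of 16
// (see the net_resolution flag help in the tutorial example above).
static bool isValidNetResolution(int width, int height)
{
    return width > 0 && height > 0 && width % 16 == 0 && height % 16 == 0;
}
```

With a fixed-size engine, a check like this belongs before engine construction: an invalid or mismatched size cannot be corrected at runtime once the engine has been serialized.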
Review discussion (on the build configuration changes):

Reviewer: If DEEP_NET is `tensorrt` then the `else` clause is never reached, hence lines 73-76 are not needed. It looks like the libraries and dirs that are part of the else clause will still be needed for `tensorrt`, however, so those should be moved up.

Reply: For now TensorRT is included for the main inference in the middle of a pipeline using CAFFE; for example, I use caffe blobs for input and output. I think it's lines 64-65 that should be removed.