diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css
new file mode 100644
index 0000000000000..b64e4802fd683
--- /dev/null
+++ b/docs/_static/theme_overrides.css
@@ -0,0 +1,7 @@
+code, .rst-content tt, .rst-content code {
+    white-space: normal;
+}
+
+.rst-content .section ol p, .rst-content .section ul p {
+    margin-bottom: 2px;
+}
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md
deleted file mode 100644
index 83494168ff46e..0000000000000
--- a/docs/api_reference/cxx_api_doc.md
+++ /dev/null
@@ -1 +0,0 @@
-# C++ API 文档
diff --git a/docs/api_reference/cxx_api_doc/Config/CPUConfig.md b/docs/api_reference/cxx_api_doc/Config/CPUConfig.md
new file mode 100644
index 0000000000000..b3f976f78d54a
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/CPUConfig.md
@@ -0,0 +1,119 @@
+# 使用 CPU 进行预测
+
+**注意：**
+1. 在 CPU 型号允许的情况下，进行预测库下载或编译时尽量使用带 AVX 和 MKL 的版本
+2. 可以尝试使用 Intel 的 MKLDNN 进行 CPU 预测加速，默认 CPU 不启用 MKLDNN
+3. 在 CPU 可用核心数足够时，可以通过设置 `SetCpuMathLibraryNumThreads` 将线程数调高一些，默认线程数为 1
+
+## CPU 设置
+
+API定义如下：
+
+```c++
+// 设置 CPU Blas 库计算线程数
+// 参数：cpu_math_library_num_threads - blas库计算线程数
+// 返回：None
+void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads);
+
+// 获取 CPU Blas 库计算线程数
+// 参数：None
+// 返回：int - cpu blas库计算线程数。
+int cpu_math_library_num_threads() const;
+```
+
+代码示例：
+
+```c++
+// 创建默认 Config 对象
+paddle_infer::Config config;
+
+// 设置 CPU Blas 库线程数为 10
+config.SetCpuMathLibraryNumThreads(10);
+
+// 通过 API 获取 CPU 信息
+int num_thread = config.cpu_math_library_num_threads();
+std::cout << "CPU blas thread number is: " << num_thread << std::endl; // 10
+```
+
+## MKLDNN 设置
+
+**注意：** 
+1. 启用 MKLDNN 的前提为已经使用 CPU 进行预测，否则启用 MKLDNN 无法生效
+2. 启用 MKLDNN BF16 要求 CPU 型号可以支持 AVX512，否则无法启用 MKLDNN BF16
+3. `SetMkldnnCacheCapacity` 请参考 <a class="reference external" href="https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md">MKLDNN cache设计文档</a>
+
+API定义如下：
+
+```c++
+// 启用 MKLDNN 进行预测加速
+// 参数：None
+// 返回：None
+void EnableMKLDNN();
+
+// 判断是否启用 MKLDNN 
+// 参数：None
+// 返回：bool - 是否启用 MKLDNN
+bool mkldnn_enabled() const;
+
+// 设置 MKLDNN 针对不同输入 shape 的 cache 容量大小
+// 参数：int - cache 容量大小
+// 返回：None
+void SetMkldnnCacheCapacity(int capacity);
+
+// 指定使用 MKLDNN 加速的 OP 列表
+// 参数：std::unordered_set<std::string> - 使用 MKLDNN 加速的 OP 列表
+// 返回：None
+void SetMKLDNNOp(std::unordered_set<std::string> op_list);
+
+// 启用 MKLDNN BFLOAT16
+// 参数：None
+// 返回：None
+void EnableMkldnnBfloat16();
+
+// 判断是否启用 MKLDNN BFLOAT16
+// 参数：None
+// 返回：bool - 是否启用 MKLDNN BFLOAT16
+bool mkldnn_bfloat16_enabled() const;
+
+// 指定使用 MKLDNN BFLOAT16 加速的 OP 列表
+// 参数：std::unordered_set<std::string> - 使用 MKLDNN BFLOAT16 加速的 OP 列表
+// 返回：None
+void SetBfloat16Op(std::unordered_set<std::string> op_list);
+```
+
+代码示例 (1)：使用 MKLDNN 进行预测
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 MKLDNN 进行预测
+config.EnableMKLDNN();
+// 通过 API 获取 MKLDNN 启用结果 - true
+std::cout << "Enable MKLDNN is: " << config.mkldnn_enabled() << std::endl;
+
+// 设置 MKLDNN 的 cache 容量大小
+config.SetMkldnnCacheCapacity(1);
+
+// 设置启用 MKLDNN 进行加速的 OP 列表
+std::unordered_set<std::string> op_list = {"softmax", "elementwise_add", "relu"};
+config.SetMKLDNNOp(op_list);
+```
+
+代码示例 (2)：使用 MKLDNN BFLOAT16 进行预测
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 MKLDNN 进行预测
+config.EnableMKLDNN();
+
+// 启用 MKLDNN BFLOAT16 进行预测
+config.EnableMkldnnBfloat16();
+// 设置启用 MKLDNN BFLOAT16 的 OP 列表
+config.SetBfloat16Op({"conv2d"});
+
+// 通过 API 获取 MKLDNN BFLOAT16 启用结果 - true
+std::cout << "Enable MKLDNN BF16 is: " << config.mkldnn_bfloat16_enabled() << std::endl;
+```
diff --git a/docs/api_reference/cxx_api_doc/Config/ConfigClass.md b/docs/api_reference/cxx_api_doc/Config/ConfigClass.md
new file mode 100644
index 0000000000000..661934d598784
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/ConfigClass.md
@@ -0,0 +1,63 @@
+# Config 构造函数
+
+`Config` 类用于保存构建 `Predictor` 对象所需的配置信息，如模型路径、是否开启 GPU 等。
+
+构造函数定义如下：
+
+```c++
+// 创建 Config 对象，默认构造函数
+Config();
+
+// 创建 Config 对象，输入为其他 Config 对象
+Config(const Config& other);
+
+// 创建 Config 对象，输入为非 Combine 模型的文件夹路径
+Config(const std::string& model_dir);
+
+// 创建 Config 对象，输入分别为 Combine 模型的模型文件路径和参数文件路径
+Config(const std::string& prog_file, const std::string& params_file);
+```
+
+代码示例 (1)：默认构造函数，通过API加载预测模型 - 非Combined模型
+
+```c++
+// 字符串 model_dir 为非 Combine 模型文件夹路径
+std::string model_dir = "../assets/models/mobilenet_v1";
+
+// 创建默认 Config 对象
+paddle_infer::Config config;
+
+// 通过 API 设置模型文件夹路径
+config.SetModel(model_dir);
+
+// 根据 Config 对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+代码示例 (2)：通过构造函数加载预测模型 - 非Combined模型
+
+```c++
+// 字符串 model_dir 为非 Combine 模型文件夹路径
+std::string model_dir = "../assets/models/mobilenet_v1";
+
+// 根据非 Combine 模型的文件夹路径构造 Config 对象
+paddle_infer::Config config(model_dir);
+
+// 根据 Config 对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+代码示例 (3)：通过构造函数加载预测模型 - Combined模型
+
+```c++
+// 字符串 prog_file 为 Combine 模型文件所在路径
+std::string prog_file = "../assets/models/mobilenet_v1/__model__";
+// 字符串 params_file 为 Combine 模型参数文件所在路径
+std::string params_file = "../assets/models/mobilenet_v1/__params__";
+
+// 根据 Combine 模型的模型文件和参数文件构造 Config 对象
+paddle_infer::Config config(prog_file, params_file);
+
+// 根据 Config 对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
diff --git a/docs/api_reference/cxx_api_doc/Config/GPUConfig.md b/docs/api_reference/cxx_api_doc/Config/GPUConfig.md
new file mode 100644
index 0000000000000..e2927a6b96236
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/GPUConfig.md
@@ -0,0 +1,300 @@
+# 使用 GPU 进行预测
+
+**注意：**
+1. Config 默认使用 CPU 进行预测，需要通过 `EnableUseGpu` 来启用 GPU 预测
+2. 可以尝试启用 CUDNN 和 TensorRT 进行 GPU 预测加速
+
+## GPU 设置
+
+API定义如下：
+
+```c++
+// 启用 GPU 进行预测
+// 参数：memory_pool_init_size_mb - 初始化分配的gpu显存，以MB为单位
+//      device_id - 设备id
+// 返回：None
+void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0);
+
+// 禁用 GPU 进行预测
+// 参数：None
+// 返回：None
+void DisableGpu();
+
+// 判断是否启用 GPU 
+// 参数：None
+// 返回：bool - 是否启用 GPU 
+bool use_gpu() const;
+
+// 获取 GPU 的device id
+// 参数：None
+// 返回：int -  GPU 的device id
+int gpu_device_id() const;
+
+// 获取 GPU 的初始显存大小
+// 参数：None
+// 返回：int -  GPU 的初始的显存大小
+int memory_pool_init_size_mb() const;
+
+// 初始化显存占总显存的百分比
+// 参数：None
+// 返回：float - 初始的显存占总显存的百分比
+float fraction_of_gpu_memory_for_pool() const;
+
+// 开启线程流，目前的行为是为每一个线程绑定一个流，在将来该行为可能改变
+// 参数：None
+// 返回：None
+void EnableGpuMultiStream();
+
+// 判断是否开启线程流
+// 参数：None
+// 返回：bool - 是否开启线程流
+bool thread_local_stream_enabled() const;
+```
+
+GPU设置代码示例：
+
+```c++
+// 创建默认 Config 对象
+paddle_infer::Config config;
+
+// 启用 GPU 进行预测 - 初始化 GPU 显存 100M, Device_ID 为 0
+config.EnableUseGpu(100, 0);
+// 通过 API 获取 GPU 信息
+std::cout << "Use GPU is: " << config.use_gpu() << std::endl; // true
+std::cout << "Init mem size is: " << config.memory_pool_init_size_mb() << std::endl;
+std::cout << "Init mem frac is: " << config.fraction_of_gpu_memory_for_pool() << std::endl;
+std::cout << "GPU device id is: " << config.gpu_device_id() << std::endl;
+
+// 禁用 GPU 进行预测
+config.DisableGpu();
+// 通过 API 获取 GPU 信息
+std::cout << "Use GPU is: " << config.use_gpu() << std::endl; // false
+```
+
+开启多线程流代码示例：
+
+```c++
+// 自定义 Barrier 类，用于线程间同步
+class Barrier {
+ public:
+  explicit Barrier(std::size_t count) : _count(count) {}
+  void Wait() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (--_count) {
+      _cv.wait(lock, [this] { return _count == 0; });
+    } else {
+      _cv.notify_all();
+    }
+  }
+ private:
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  std::size_t _count;
+};
+
+void test_main(const paddle_infer::Config& config, Barrier* barrier = nullptr) {
+  static std::mutex mutex;
+  // 创建 Predictor 对象
+  std::shared_ptr<paddle_infer::Predictor> predictor;
+  {
+    std::unique_lock<std::mutex> lock(mutex);
+    predictor = std::move(paddle_infer::CreatePredictor(config));
+  }
+  if (barrier) {
+    barrier->Wait();
+  }
+  // 准备输入数据
+  int input_num = shape_production(INPUT_SHAPE);
+  std::vector<float> input_data(input_num, 1);
+  auto input_names = predictor->GetInputNames();
+  auto input_tensor = predictor->GetInputHandle(input_names[0]);
+  input_tensor->Reshape(INPUT_SHAPE);
+  input_tensor->CopyFromCpu(input_data.data());
+  // 执行预测
+  predictor->Run();
+  // 获取预测输出
+  auto output_names = predictor->GetOutputNames();
+  auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_tensor->shape();
+  std::cout << "Output shape is " << shape_to_string(output_shape) << std::endl;
+}
+
+int main(int argc, char **argv) {
+  const size_t thread_num = 5;
+  std::vector<std::thread> threads(thread_num);
+  Barrier barrier(thread_num);
+  // 创建 5 个线程，并为每个线程开启一个单独的GPU Stream
+  for (size_t i = 0; i < threads.size(); ++i) {
+    threads[i] = std::thread([&barrier]() {
+      paddle_infer::Config config;
+      config.EnableUseGpu(100, 0);
+      config.SetModel(FLAGS_infer_model);
+      config.EnableGpuMultiStream();
+      test_main(config, &barrier);
+    });
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+```
+
+## CUDNN 设置
+
+**注意：** 启用 CUDNN 的前提为已经启用 GPU，否则启用 CUDNN 无法生效。
+
+API定义如下：
+
+```c++
+// 启用 CUDNN 进行预测加速
+// 参数：None
+// 返回：None
+void EnableCUDNN();
+
+// 判断是否启用 CUDNN 
+// 参数：None
+// 返回：bool - 是否启用 CUDNN
+bool cudnn_enabled() const;
+```
+
+代码示例：
+
+```c++
+// 创建默认 Config 对象
+paddle_infer::Config config;
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+// 启用 CUDNN 进行预测加速
+config.EnableCUDNN();
+// 通过 API 获取 CUDNN 启用结果
+std::cout << "Enable CUDNN is: " << config.cudnn_enabled() << std::endl; // true
+
+// 禁用 GPU 进行预测
+config.DisableGpu();
+// 启用 CUDNN 进行预测加速 - 因为 GPU 被禁用，因此 CUDNN 启用不生效
+config.EnableCUDNN();
+// 通过 API 获取 CUDNN 启用结果
+std::cout << "Enable CUDNN is: " << config.cudnn_enabled() << std::endl; // false
+```
+
+## TensorRT 设置
+
+**注意：** 
+1. 启用 TensorRT 的前提为已经启用 GPU，否则启用 TensorRT 无法生效
+2. 对存在LoD信息的模型，如Bert, Ernie等NLP模型，必须使用动态 Shape
+3. 启用 TensorRT OSS 可以支持更多 plugin，详细参考 [TensorRT OSS](https://news.developer.nvidia.com/nvidia-open-sources-parsers-and-plugins-in-tensorrt/)
+
+更多 TensorRT 详细信息，请参考 [使用Paddle-TensorRT库预测](../../../optimize/paddle_trt)。
+
+API定义如下：
+
+```c++
+// 启用 TensorRT 进行预测加速
+// 参数：workspace_size     - 指定 TensorRT 使用的工作空间大小
+//      max_batch_size     - 设置最大的 batch 大小，运行时 batch 大小不得超过此限定值
+//      min_subgraph_size  - Paddle-TRT 是以子图的形式运行，为了避免性能损失，当子图内部节点个数
+//                           大于 min_subgraph_size 的时候，才会使用 Paddle-TRT 运行
+//      precision          - 指定使用 TRT 的精度，支持 FP32(kFloat32)，FP16(kHalf)，Int8(kInt8)
+//      use_static         - 若指定为 true，在初次运行程序的时候会将 TRT 的优化信息进行序列化到磁盘上，
+//                           下次运行时直接加载优化的序列化信息而不需要重新生成
+//      use_calib_mode     - 若要运行 Paddle-TRT INT8 离线量化校准，需要将此选项设置为 true
+// 返回：None
+void EnableTensorRtEngine(int workspace_size = 1 << 20,
+                          int max_batch_size = 1, int min_subgraph_size = 3,
+                          Precision precision = Precision::kFloat32,
+                          bool use_static = false,
+                          bool use_calib_mode = true);
+// 判断是否启用 TensorRT 
+// 参数：None
+// 返回：bool - 是否启用 TensorRT
+bool tensorrt_engine_enabled() const;
+
+// 设置 TensorRT 的动态 Shape
+// 参数：min_input_shape          - TensorRT 子图支持动态 shape 的最小 shape
+//      max_input_shape          - TensorRT 子图支持动态 shape 的最大 shape
+//      optim_input_shape        - TensorRT 子图支持动态 shape 的最优 shape
+//      disable_trt_plugin_fp16  - 设置 TensorRT 的 plugin 不在 fp16 精度下运行
+// 返回：None
+void SetTRTDynamicShapeInfo(
+      std::map<std::string, std::vector<int>> min_input_shape,
+      std::map<std::string, std::vector<int>> max_input_shape,
+      std::map<std::string, std::vector<int>> optim_input_shape,
+      bool disable_trt_plugin_fp16 = false);
+
+// 启用 TensorRT OSS 进行预测加速
+// 参数：None
+// 返回：None
+void EnableTensorRtOSS();
+
+// 判断是否启用 TensorRT OSS
+// 参数：None
+// 返回：bool - 是否启用 TensorRT OSS
+bool tensorrt_oss_enabled();
+```
+
+代码示例 (1)：使用 TensorRT FP32 / FP16 / INT8 进行预测
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+
+// 启用 TensorRT 进行预测加速 - FP32
+config.EnableTensorRtEngine(1 << 20, 1, 3, 
+                            paddle_infer::PrecisionType::kFloat32, false, false);
+// 通过 API 获取 TensorRT 启用结果 - true
+std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;
+
+// 启用 TensorRT 进行预测加速 - FP16
+config.EnableTensorRtEngine(1 << 20, 1, 3, 
+                            paddle_infer::PrecisionType::kHalf, false, false);
+// 通过 API 获取 TensorRT 启用结果 - true
+std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;
+
+// 启用 TensorRT 进行预测加速 - Int8
+config.EnableTensorRtEngine(1 << 20, 1, 3, 
+                            paddle_infer::PrecisionType::kInt8, false, true);
+// 通过 API 获取 TensorRT 启用结果 - true
+std::cout << "Enable TensorRT is: " << config.tensorrt_engine_enabled() << std::endl;
+```
+
+代码示例 (2)：使用 TensorRT 动态 Shape 进行预测
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+
+// 启用 TensorRT 进行预测加速 - Int8
+config.EnableTensorRtEngine(1 << 30, 1, 1,
+                            paddle_infer::PrecisionType::kInt8, false, true);
+// 设置模型输入的动态 Shape 范围
+std::map<std::string, std::vector<int>> min_input_shape = {{"image", {1, 1, 3, 3}}};
+std::map<std::string, std::vector<int>> max_input_shape = {{"image", {1, 1, 10, 10}}};
+std::map<std::string, std::vector<int>> opt_input_shape = {{"image", {1, 1, 3, 3}}};
+// 设置 TensorRT 的动态 Shape
+config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape);
+```
+
+代码示例 (3)：使用 TensorRT OSS 进行预测
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+
+// 启用 TensorRT 进行预测加速
+config.EnableTensorRtEngine();
+// 启用 TensorRT OSS 进行预测加速
+config.EnableTensorRtOSS();
+
+// 通过 API 获取 TensorRT OSS 启用结果 - true
+std::cout << "Enable TensorRT OSS is: " << config.tensorrt_oss_enabled() << std::endl;
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Config/InternalUse.md b/docs/api_reference/cxx_api_doc/Config/InternalUse.md
new file mode 100644
index 0000000000000..66bf741adbdea
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/InternalUse.md
@@ -0,0 +1,66 @@
+# 仅供内部使用
+
+API定义如下：
+
+```c++
+// 转化为 NativeConfig，不推荐使用
+// 参数：None
+// 返回：当前 Config 对应的 NativeConfig
+NativeConfig ToNativeConfig() const;
+
+// 设置是否使用Feed, Fetch OP，仅内部使用
+// 当使用 ZeroCopyTensor 时，需设置为 false
+// 参数：x - 是否使用Feed, Fetch OP，默认为 true
+// 返回：None
+void SwitchUseFeedFetchOps(int x = true);
+
+// 判断是否使用Feed, Fetch OP
+// 参数：None
+// 返回：bool - 是否使用Feed, Fetch OP
+bool use_feed_fetch_ops_enabled() const;
+
+// 设置是否需要指定输入 Tensor 的 Name，仅对内部 ZeroCopyTensor 有效
+// 参数：x - 是否指定输入 Tensor 的 Name，默认为 true
+// 返回：None
+void SwitchSpecifyInputNames(bool x = true);
+
+// 判断是否需要指定输入 Tensor 的 Name，仅对内部 ZeroCopyTensor 有效
+// 参数：None
+// 返回：bool - 是否需要指定输入 Tensor 的 Name
+bool specify_input_name() const;
+
+// 设置 Config 为无效状态，仅内部使用，保证每一个 Config 仅用来初始化一次 Predictor
+// 参数：None
+// 返回：None
+void SetInValid();
+
+// 判断当前 Config 是否有效
+// 参数：None
+// 返回：bool - 当前 Config 是否有效
+bool is_valid() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 转化为 NativeConfig
+auto native_config = config.ToNativeConfig();
+
+// 禁用 Feed, Fetch OP
+config.SwitchUseFeedFetchOps(false);
+// 返回是否使用 Feed, Fetch OP - false
+std::cout << "UseFeedFetchOps is: " << config.use_feed_fetch_ops_enabled() << std::endl;
+
+// 设置需要指定输入 Tensor 的 Name
+config.SwitchSpecifyInputNames(true);
+// 返回是否需要指定输入 Tensor 的 Name - true
+std::cout << "Specify Input Name is: " << config.specify_input_name() << std::endl;
+
+// 设置 Config 为无效状态
+config.SetInValid();
+// 判断当前 Config 是否有效 - false
+std::cout << "Config validation is: " << config.is_valid() << std::endl;
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Config/ModelConfig.md b/docs/api_reference/cxx_api_doc/Config/ModelConfig.md
new file mode 100644
index 0000000000000..2212689342104
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/ModelConfig.md
@@ -0,0 +1,145 @@
+# 设置预测模型
+
+## 从文件中加载预测模型 - 非Combined模型 
+
+API定义如下：
+
+```c++
+// 设置模型文件路径，当需要从磁盘加载非 Combined 模型时使用
+// 参数：model_dir - 模型文件夹路径
+// 返回：None
+void SetModel(const std::string& model_dir);
+
+// 获取非 Combined 模型的文件夹路径
+// 参数：None
+// 返回：string - 模型文件夹路径
+const std::string& model_dir();
+```
+
+代码示例：
+
+```c++
+// 字符串 model_dir 为非 Combined 模型文件夹路径
+std::string model_dir = "../assets/models/mobilenet_v1";
+
+// 创建默认 Config 对象
+paddle_infer::Config config;
+
+// 通过 API 设置模型文件夹路径
+config.SetModel(model_dir);
+
+// 通过 API 获取 config 中的模型路径
+std::cout << "Model Path is: " << config.model_dir() << std::endl;
+
+// 根据Config对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+## 从文件中加载预测模型 -  Combined 模型
+
+API定义如下：
+
+```c++
+// 设置模型文件路径，当需要从磁盘加载 Combined 模型时使用
+// 参数：prog_file_path - 模型文件路径
+//      params_file_path - 参数文件路径
+// 返回：None
+void SetModel(const std::string& prog_file_path,
+              const std::string& params_file_path);
+
+// 设置模型文件路径，当需要从磁盘加载 Combined 模型时使用。
+// 参数：x - 模型文件路径
+// 返回：None
+void SetProgFile(const std::string& x);
+
+
+// 设置参数文件路径，当需要从磁盘加载 Combined 模型时使用
+// 参数：x - 参数文件路径
+// 返回：None
+void SetParamsFile(const std::string& x);
+
+// 获取 Combined 模型的模型文件路径
+// 参数：None
+// 返回：string - 模型文件路径
+const std::string& prog_file();
+
+// 获取 Combined 模型的参数文件路径
+// 参数：None
+// 返回：string - 参数文件路径
+const std::string& params_file();
+```
+
+代码示例：
+
+```c++
+// 字符串 prog_file 为 Combined 模型的模型文件所在路径
+std::string prog_file = "../assets/models/mobilenet_v1/__model__";
+// 字符串 params_file 为 Combined 模型的参数文件所在路径
+std::string params_file = "../assets/models/mobilenet_v1/__params__";
+
+// 创建默认 Config 对象
+paddle_infer::Config config;
+// 通过 API 设置模型文件路径和参数文件路径
+config.SetModel(prog_file, params_file);
+// 注意：SetModel API与以下2行代码等同
+// config.SetProgFile(prog_file);
+// config.SetParamsFile(params_file);
+
+// 通过 API 获取 config 中的模型文件和参数文件路径
+std::cout << "Model file path is: " << config.prog_file() << std::endl;
+std::cout << "Model param path is: " << config.params_file() << std::endl;
+
+// 根据 Config 对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+## 从内存中加载预测模型
+
+API定义如下：
+
+```c++
+// 从内存加载模型
+// 参数：prog_buffer - 内存中模型结构数据
+//      prog_buffer_size - 内存中模型结构数据的大小
+//      params_buffer - 内存中模型参数数据
+//      params_buffer_size - 内存中模型参数数据的大小
+// 返回：None
+void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
+                    const char* params_buffer, size_t params_buffer_size);
+
+// 判断是否从内存中加载模型
+// 参数：None
+// 返回：bool - 是否从内存中加载模型
+bool model_from_memory() const;
+```
+
+代码示例：
+
+```c++
+// 定义文件读取函数
+std::string read_file(std::string filename) {
+  std::ifstream file(filename);
+  return std::string((std::istreambuf_iterator<char>(file)),
+                     std::istreambuf_iterator<char>());
+}
+
+// 设置模型文件和参数文件所在路径
+std::string prog_file = "../assets/models/mobilenet_v1/__model__";
+std::string params_file = "../assets/models/mobilenet_v1/__params__";
+
+// 加载模型文件到内存
+std::string prog_str = read_file(prog_file);
+std::string params_str = read_file(params_file);
+
+// 创建默认 Config 对象
+paddle_infer::Config config;
+// 从内存中加载模型
+config.SetModelBuffer(prog_str.c_str(), prog_str.size(),
+                      params_str.c_str(), params_str.size());
+
+// 通过 API 获取 config 中 model_from_memory 的值
+std::cout << "Load model from memory is: " << config.model_from_memory() << std::endl;
+
+// 根据 Config 对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
diff --git a/docs/api_reference/cxx_api_doc/Config/OptimConfig.md b/docs/api_reference/cxx_api_doc/Config/OptimConfig.md
new file mode 100644
index 0000000000000..dcdb1f4b64796
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/OptimConfig.md
@@ -0,0 +1,110 @@
+# 设置模型优化方法
+
+## IR 优化
+
+**注意：** 关于自定义 IR 优化 Pass，请参考 [PaddlePassBuilder 类](../PaddlePassBuilder)
+
+API定义如下：
+
+```c++
+// 启用 IR 优化
+// 参数：x - 是否开启 IR 优化，默认打开
+// 返回：None
+void SwitchIrOptim(int x = true);
+
+// 判断是否开启 IR 优化 
+// 参数：None
+// 返回：bool - 是否开启 IR 优化
+bool ir_optim() const;
+
+// 设置是否在图分析阶段打印 IR，启用后会在每一个 PASS 后生成 dot 文件
+// 参数：x - 是否打印 IR，默认打开
+// 返回：None
+void SwitchIrDebug(int x = true);
+
+// 返回 pass_builder，用来自定义图分析阶段选择的 IR
+// 参数：None
+// 返回：PassStrategy - pass_builder对象
+PassStrategy* pass_builder() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_model_dir);
+
+// 开启 IR 优化
+config.SwitchIrOptim();
+// 开启 IR 打印
+config.SwitchIrDebug();
+
+// 得到 pass_builder 对象
+auto pass_builder = config.pass_builder();
+// 在 IR 优化阶段，去除 fc_fuse_pass
+pass_builder->DeletePass("fc_fuse_pass");
+
+// 通过 API 获取 IR 优化是否开启 - true
+std::cout << "IR Optim is: " << config.ir_optim() << std::endl;
+
+// 根据Config对象创建预测器对象
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+运行结果示例：
+
+```bash
+# SwitchIrOptim 开启 IR 优化后，运行中会有如下 LOG 输出
+--- Running analysis [ir_graph_build_pass]
+--- Running analysis [ir_graph_clean_pass]
+--- Running analysis [ir_analysis_pass]
+--- Running IR pass [simplify_with_basic_ops_pass]
+--- Running IR pass [attention_lstm_fuse_pass]
+--- Running IR pass [seqconv_eltadd_relu_fuse_pass]
+...
+--- Running analysis [inference_op_replace_pass]
+--- Running analysis [ir_graph_to_program_pass]
+
+# SwitchIrDebug 开启 IR 打印后，运行结束之后会在目录下生成如下 DOT 文件
+-rw-r--r-- 1 root root  70K Nov 17 10:47 0_ir_simplify_with_basic_ops_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 10_ir_fc_gru_fuse_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 11_ir_graph_viz_pass.dot
+...
+-rw-r--r-- 1 root root  72K Nov 17 10:47 8_ir_mul_lstm_fuse_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 9_ir_graph_viz_pass.dot
+```
+
+## Lite 子图
+
+```c++ 
+// 启用 Lite 子图
+// 参数：precision_mode - Lite 子图的运行精度，默认为 FP32
+//      zero_copy      - 启用 zero_copy，lite 子图与 paddle inference 之间共享数据
+//      passes_filter  - 设置 lite 子图的 pass
+//      ops_filter     - 设置不使用 lite 子图运行的 op
+// 返回：None
+void EnableLiteEngine(
+      AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
+      const std::vector<std::string>& passes_filter = {},
+      const std::vector<std::string>& ops_filter = {});
+
+
+// 判断是否启用 Lite 子图
+// 参数：None
+// 返回：bool - 是否启用 Lite 子图
+bool lite_engine_enabled() const;
+```
+
+示例代码：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_model_dir);
+
+config.EnableUseGpu(100, 0);
+config.EnableLiteEngine(paddle_infer::PrecisionType::kFloat32);
+
+// 通过 API 获取 Lite 子图启用信息 - true
+std::cout << "Lite Engine is: " << config.lite_engine_enabled() << std::endl;
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Config/OtherFunction.md b/docs/api_reference/cxx_api_doc/Config/OtherFunction.md
new file mode 100644
index 0000000000000..28be548f87a41
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/OtherFunction.md
@@ -0,0 +1,179 @@
+# 启用内存优化
+
+API定义如下：
+
+```c++
+// 开启内/显存复用，具体降低内存效果取决于模型结构。
+// 参数：None
+// 返回：None
+void EnableMemoryOptim();
+
+// 判断是否开启内/显存复用
+// 参数：None
+// 返回：bool - 是否开启内/显存复用
+bool enable_memory_optim() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 开启 CPU 内存优化
+config.EnableMemoryOptim();
+// 通过 API 获取 CPU 是否已经开启内存优化 - true
+std::cout << "CPU Mem Optim is: " << config.enable_memory_optim() << std::endl;
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+// 开启 GPU 显存优化
+config.EnableMemoryOptim();
+// 通过 API 获取 GPU 是否已经开启显存优化 - true
+std::cout << "GPU Mem Optim is: " << config.enable_memory_optim() << std::endl;
+```
+
+# 设置缓存路径
+
+**注意：** 如果当前使用的为 TensorRT INT8 且设置从内存中加载模型，则必须通过 `SetOptimCacheDir` 来设置缓存路径。
+
+
+API定义如下：
+
+```c++
+// 设置缓存路径
+// 参数：opt_cache_dir - 缓存路径
+// 返回：None
+void SetOptimCacheDir(const std::string& opt_cache_dir);
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 设置缓存路径
+config.SetOptimCacheDir(FLAGS_infer_model + "/OptimCacheDir");
+```
+
+# FC Padding
+
+API定义如下：
+
+```c++
+// 禁用 FC Padding
+// 参数：None
+// 返回：None
+void DisableFCPadding();
+
+// 判断是否启用 FC Padding
+// 参数：None
+// 返回：bool - 是否启用 FC Padding
+bool use_fc_padding() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 禁用 FC Padding
+config.DisableFCPadding();
+
+// 通过 API 获取是否启用 FC Padding - false
+std::cout << "Use FC Padding is: " << config.use_fc_padding() << std::endl;
+```
+
+# Profile 设置
+
+API定义如下：
+
+```c++
+// 打开 Profile，运行结束后会打印所有 OP 的耗时占比。
+// 参数：None
+// 返回：None
+void EnableProfile();
+
+// 判断是否开启 Profile
+// 参数：None
+// 返回：bool - 是否开启 Profile
+bool profile_enabled() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 打开 Profile
+config.EnableProfile();
+
+// 判断是否开启 Profile - true
+std::cout << "Profile is: " << config.profile_enabled() << std::endl;
+```
+
+输出的 Profile 的结果如下：
+
+```bash
+------------------------->     Profiling Report     <-------------------------
+
+Place: CPU
+Time unit: ms
+Sorted by total time in descending order in the same thread
+
+-------------------------     Overhead Summary      -------------------------
+
+Total time: 1085.33
+  Computation time       Total: 1066.24     Ratio: 98.2411%
+  Framework overhead     Total: 19.0902     Ratio: 1.75893%
+
+-------------------------     GpuMemCpy Summary     -------------------------
+
+GpuMemcpy                Calls: 0           Total: 0           Ratio: 0%
+
+-------------------------       Event Summary       -------------------------
+
+Event                            Calls       Total       Min.        Max.        Ave.        Ratio.
+thread0::conv2d                  210         319.734     0.815591    6.51648     1.52254     0.294595
+thread0::load                    137         284.596     0.114216    258.715     2.07735     0.26222
+thread0::depthwise_conv2d        195         266.241     0.955945    2.47858     1.36534     0.245308
+thread0::elementwise_add         210         122.969     0.133106    2.15806     0.585568    0.113301
+thread0::relu                    405         56.1807     0.021081    0.585079    0.138718    0.0517635
+thread0::batch_norm              195         25.8073     0.044304    0.33896     0.132345    0.0237783
+thread0::fc                      15          7.13856     0.451674    0.714895    0.475904    0.0065773
+thread0::pool2d                  15          1.48296     0.09054     0.145702    0.0988637   0.00136636
+thread0::softmax                 15          0.941837    0.032175    0.460156    0.0627891   0.000867786
+thread0::scale                   15          0.240771    0.013394    0.030727    0.0160514   0.000221841
+```
+
+# Log 设置
+
+API定义如下：
+
+```c++
+// 去除 Paddle Inference 运行中的 LOG
+// 参数：None
+// 返回：None
+void DisableGlogInfo();
+
+// 判断是否禁用 LOG
+// 参数：None
+// 返回：bool - 是否禁用 LOG
+bool glog_info_disabled() const;
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 去除 Paddle Inference 运行中的 LOG
+config.DisableGlogInfo();
+
+// 判断是否禁用 LOG - true
+std::cout << "GLOG INFO is: " << config.glog_info_disabled() << std::endl;
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Config/XPUConfig.md b/docs/api_reference/cxx_api_doc/Config/XPUConfig.md
new file mode 100644
index 0000000000000..c969cf29c4171
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config/XPUConfig.md
@@ -0,0 +1,21 @@
+
+# 使用 XPU 进行预测
+
+API定义如下：
+
+```c++
+// 启用 XPU 进行预测
+// 参数：l3_workspace_size - l3 cache 分配的显存大小
+// 返回：None
+void EnableXpu(int l3_workspace_size = 0xfffc00);
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_model_dir);
+
+// 启用 XPU，并设置 l3 cache 大小为 100M
+config.EnableXpu(100);
+```
diff --git a/docs/api_reference/cxx_api_doc/Config_index.rst b/docs/api_reference/cxx_api_doc/Config_index.rst
new file mode 100644
index 0000000000000..977945ad8ed91
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Config_index.rst
@@ -0,0 +1,15 @@
+Config 类
+================
+
+.. toctree::
+    :maxdepth: 2
+    :numbered: 3
+
+    Config/ConfigClass
+    Config/ModelConfig
+    Config/CPUConfig
+    Config/GPUConfig
+    Config/XPUConfig
+    Config/OptimConfig
+    Config/OtherFunction
+    Config/InternalUse
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/CreatePredictor.md b/docs/api_reference/cxx_api_doc/CreatePredictor.md
new file mode 100644
index 0000000000000..a1e0c10dc783e
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/CreatePredictor.md
@@ -0,0 +1,38 @@
+# CreatePredictor 方法
+
+API定义如下：
+
+```c++
+// 根据 Config 构建预测执行对象 Predictor
+// 参数: config - 用于构建 Predictor 的配置信息
+// 返回: std::shared_ptr<Predictor> - 预测对象的智能指针
+std::shared_ptr<Predictor> CreatePredictor(const Config& config);
+```
+
+代码示例:
+
+```c++
+// 创建 Config
+paddle_infer::Config config("../assets/models/mobilenet_v1");
+
+// 根据 Config 创建 Predictor
+auto predictor = paddle_infer::CreatePredictor(config);
+```
+
+# GetVersion 方法
+
+API定义如下：
+
+```c++
+// 获取 Paddle 版本信息
+// 参数: None
+// 返回: std::string - Paddle 版本信息
+std::string GetVersion();
+```
+
+代码示例:
+
+```c++
+// 获取 Paddle 版本信息
+std::string paddle_version = paddle_infer::GetVersion();
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Enum.md b/docs/api_reference/cxx_api_doc/Enum.md
new file mode 100644
index 0000000000000..5d85175365bd6
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Enum.md
@@ -0,0 +1,108 @@
+# 枚举类型
+
+## DataType
+
+DataType为模型中Tensor的数据精度，默认值为 `FLOAT32`。枚举变量与 API 定义如下：
+
+```c++
+// DataType 枚举类型定义
+enum DataType {
+  FLOAT32,
+  INT64,
+  INT32,
+  UINT8,
+};
+
+// 获取各个 DataType 对应的字节数
+// 参数：dtype - DataType 枚举
+// 返回：int - 字节数
+int GetNumBytesOfDataType(DataType dtype);
+```
+
+代码示例：
+
+
+```c++
+// 创建 FLOAT32 类型 DataType
+auto data_type = paddle_infer::DataType::FLOAT32;
+
+// 输出 data_type 的字节数 - 4
+std::cout << paddle_infer::GetNumBytesOfDataType(data_type) << std::endl;
+```
+
+## PrecisionType
+
+PrecisionType设置模型的运行精度，默认值为 `kFloat32(float32)`。枚举变量定义如下：
+
+```c++
+// PrecisionType 枚举类型定义
+enum class PrecisionType {
+  kFloat32 = 0,  ///< fp32
+  kInt8,         ///< int8
+  kHalf,         ///< fp16
+};
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config(FLAGS_infer_model + "/mobilenet");
+
+// 启用 GPU 进行预测
+config.EnableUseGpu(100, 0);
+
+// 启用 TensorRT 进行预测加速 - FP16
+config.EnableTensorRtEngine(1 << 20, 1, 3, 
+                            paddle_infer::PrecisionType::kHalf, false, false);
+```
+
+
+## PlaceType
+
+PlaceType为目标设备硬件类型，用户可以根据应用场景选择硬件平台类型。枚举变量定义如下：
+
+```c++
+// PlaceType 枚举类型定义
+enum class PlaceType { kUNK = -1, kCPU, kGPU };
+```
+
+代码示例：
+
+```c++
+// 创建 Config 对象
+paddle_infer::Config config;
+
+// 启用 GPU 预测
+config.EnableUseGpu(100, 0);
+config.SetModel(model_dir);
+
+// 创建 Predictor
+auto predictor = paddle_infer::CreatePredictor(config);
+
+// 准备输入数据
+int input_num = shape_production(INPUT_SHAPE);
+std::vector<float> input_data(input_num, 1);
+
+// 准备输入 Tensor
+auto input_names = predictor->GetInputNames();
+auto input_tensor = predictor->GetInputHandle(input_names[0]);
+input_tensor->Reshape({1, 3, 224, 224});
+input_tensor->CopyFromCpu(input_data.data());
+
+// 执行预测
+predictor->Run();
+
+// 获取 Output Tensor
+auto output_names = predictor->GetOutputNames();
+auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+
+// 获取 Output Tensor 的 PlaceType 和 数据指针
+paddle_infer::PlaceType place;
+int size = 0;
+auto* out_data = output_tensor->data<float>(&place, &size);
+
+// 输出 Place 结果 - true
+std::cout << (place == paddle_infer::PlaceType::kGPU) << std::endl;
+std::cout << size / sizeof(float) << std::endl;
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/PaddlePassBuilder.md b/docs/api_reference/cxx_api_doc/PaddlePassBuilder.md
new file mode 100644
index 0000000000000..edb4caa8607bc
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/PaddlePassBuilder.md
@@ -0,0 +1,136 @@
+
+# PaddlePassBuilder 类
+
+**注意：** PaddlePassBuilder 对象通过 `Config` 的 `pass_builder` 方法进行获取。其中存在 2 个成员对象 AnalysisPasses 和 Passes，AnalysisPasses 独立于 Passes 之外，仅 `AppendAnalysisPass` 和 `AnalysisPasses` 两个 API 能对其进行修改和读取，其余 API 的操作对象都仅限于 Passes。
+
+类及方法定义如下：
+
+```c++
+// 设置 IR 图分析阶段的 passes
+// 参数：passes - IR 图分析阶段的 passes 的字符串列表
+// 返回：None
+void SetPasses(std::initializer_list<std::string> passes);
+
+// 在 Passes 末尾添加 pass
+// 参数：pass_type - 需要添加的 pass 字符串
+// 返回：None
+void AppendPass(const std::string &pass_type);
+
+// 在 Passes 中的第 idx 位置插入 pass
+// 参数：idx - 插入的 index 位置
+//      pass_type - 需要插入的 pass 字符串
+// 返回：None
+void InsertPass(size_t idx, const std::string &pass_type);
+
+// 删除第 idx 位置的pass
+// 参数：idx - 删除的 index 位置
+// 返回：None
+void DeletePass(size_t idx);
+
+// 删除字符串匹配为 pass_type 的 pass
+// 参数：pass_type - 需要删除的 pass 字符串
+// 返回：None
+void DeletePass(const std::string &pass_type);
+
+// 清空所有 IR 优化中的 Passes
+// 参数：None
+// 返回：None
+void ClearPasses();
+
+// 启用Debug, 会在每一个 PASS 优化后生成当前计算图 DOT
+// 即在每一个 fuse pass 之后添加一个 graph_viz_pass 进行 pass 可视化
+// 参数：None
+// 返回：None
+void TurnOnDebug();
+
+// 获取 IR 图分析阶段的 Passes 中的可读信息
+// 参数：None
+// 返回：std::string - 所有 Passes 的可读信息
+std::string DebugString();
+
+// 获取 IR 图分析阶段的所有 Passes
+// 参数：None
+// 返回：std::vector<std::string> - 所有 Passes 字符串列表
+const std::vector<std::string> &AllPasses();
+
+// 添加 Analysis Pass
+// 参数：pass - 需要添加的 Analysis Pass 字符串表示
+// 返回：None
+void AppendAnalysisPass(const std::string &pass);
+
+// 获取 IR 图分析阶段的所有 Analysis Passes
+// 参数：None
+// 返回：std::vector<std::string> - 所有 Analysis Passes 字符串列表
+std::vector<std::string> AnalysisPasses() const;
+```
+
+自定义 IR Pass 代码示例：
+
+```c++
+// 构造 Config 对象
+paddle_infer::Config config(FLAGS_infer_model);
+
+// 开启 IR 优化
+config.SwitchIrOptim();
+
+// 得到 pass_builder 对象
+auto pass_builder = config.pass_builder();
+
+// 获取 pass_builder 中的所有 Passes
+const std::vector<std::string> all_passes = pass_builder->AllPasses();
+
+// all_passes 中返回结果如下:
+// simplify_with_basic_ops_pass
+// attention_lstm_fuse_pass
+// ...
+// runtime_context_cache_pass
+
+// 清空所有 Passes
+pass_builder->ClearPasses();
+// 设置 Passes
+pass_builder->SetPasses({"attention_lstm_fuse_pass", "fc_gru_fuse_pass"});
+// 在末尾处添加pass
+pass_builder->AppendPass("fc_fuse_pass");
+// 删除 Passes
+pass_builder->DeletePass("fc_fuse_pass");
+// 在 idx = 0 的位置添加 pass
+pass_builder->InsertPass(0, "fc_fuse_pass");
+// 删除 idx = 0 所在位置的 pass
+pass_builder->DeletePass(0);
+// 启用Debug, 会在每一个 PASS 优化后生成当前计算图 DOT
+// 即在每一个 pass 之后添加一个 graph_viz_pass
+pass_builder->TurnOnDebug();
+// 获取 IR 图分析阶段的 Passes 中的可读信息
+std::cout << pass_builder->DebugString() << std::endl;
+
+// 运行以上代码得到的输出结果如下：
+//  - attention_lstm_fuse_pass
+//  - graph_viz_pass
+//  - fc_gru_fuse_pass
+//  - graph_viz_pass
+```
+
+对 Analysis Pass 进行操作和读取示例：
+
+```c++
+// 构造 Config 对象
+paddle_infer::Config config(FLAGS_infer_model);
+
+// 开启 IR 优化
+config.SwitchIrOptim();
+
+// 得到 pass_builder 对象
+auto pass_builder = config.pass_builder();
+
+// 添加 analysis pass
+pass_builder->AppendAnalysisPass("ir_analysis_pass");
+
+// 获取 pass_builder 中的所有 Analysis Passes
+const std::vector<std::string> analysis_passes = pass_builder->AnalysisPasses();
+
+// analysis_passes 中返回结果如下:
+// ir_graph_build_pass
+// ir_graph_clean_pass
+// ...
+// ir_graph_to_program_pass
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Predictor.md b/docs/api_reference/cxx_api_doc/Predictor.md
new file mode 100644
index 0000000000000..4cf43f9707ebd
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Predictor.md
@@ -0,0 +1,119 @@
+# Predictor 类
+
+Paddle Inference的预测器，由 `CreatePredictor` 根据 `Config` 进行创建。用户可以根据Predictor提供的接口设置输入数据、执行模型预测、获取输出等。
+
+## 获取输入输出
+
+API 定义如下：
+
+```c++
+// 获取所有输入 Tensor 的名称
+// 参数：None
+// 返回：std::vector<std::string> - 所有输入 Tensor 的名称
+std::vector<std::string> GetInputNames();
+
+// 根据名称获取输入 Tensor 的句柄
+// 参数：name - Tensor 的名称
+// 返回：std::unique_ptr<Tensor> - 指向 Tensor 的指针
+std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+
+// 获取所有输出 Tensor 的名称
+// 参数：None
+// 返回：std::vector<std::string> - 所有输出 Tensor 的名称
+std::vector<std::string> GetOutputNames();
+
+// 根据名称获取输出 Tensor 的句柄
+// 参数：name - Tensor 的名称
+// 返回：std::unique_ptr<Tensor> - 指向 Tensor 的指针
+std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+```
+
+代码示例：
+
+```c++
+// 构造 Config 对象
+paddle_infer::Config config(FLAGS_infer_model);
+
+// 创建 Predictor
+auto predictor = paddle_infer::CreatePredictor(config);
+
+// 准备输入数据
+int input_num = shape_production(INPUT_SHAPE);
+std::vector<float> input_data(input_num, 1);
+
+// 准备输入 Tensor
+auto input_names = predictor->GetInputNames();
+auto input_tensor = predictor->GetInputHandle(input_names[0]);
+input_tensor->Reshape({1, 3, 224, 224});
+input_tensor->CopyFromCpu(input_data.data());
+
+// 执行预测
+predictor->Run();
+
+// 获取 Output Tensor
+auto output_names = predictor->GetOutputNames();
+auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+```
+
+## 运行和生成
+
+API 定义如下：
+
+```c++
+// 执行模型预测，需要在设置输入数据后调用
+// 参数：None
+// 返回：bool - 预测是否执行成功
+bool Run();
+
+// 根据该Predictor，克隆一个新的Predictor，两个Predictor之间共享权重
+// 参数：None
+// 返回：std::unique_ptr<Predictor> - 新的 Predictor
+std::unique_ptr<Predictor> Clone();
+
+// 释放中间Tensor
+// 参数：None
+// 返回：None
+void ClearIntermediateTensor();
+
+// 释放内存池中的所有临时 Tensor
+// 参数：None
+// 返回：uint64_t - 释放的内存字节数
+uint64_t TryShrinkMemory();
+```
+
+代码示例：
+
+```c++
+// 创建 Predictor
+auto predictor = paddle_infer::CreatePredictor(config);
+
+// 准备输入数据
+int input_num = shape_production(INPUT_SHAPE);
+std::vector<float> input_data(input_num, 1);
+
+// 准备输入 Tensor
+auto input_names = predictor->GetInputNames();
+auto input_tensor = predictor->GetInputHandle(input_names[0]);
+input_tensor->Reshape({1, 3, 224, 224});
+input_tensor->CopyFromCpu(input_data.data());
+
+// 执行预测
+predictor->Run();
+
+// 获取 Output Tensor
+auto output_names = predictor->GetOutputNames();
+auto output_t = predictor->GetOutputHandle(output_names[0]);
+std::vector<int> output_shape = output_t->shape();
+int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 
+                              1, std::multiplies<int>());
+// 获取 Output 数据
+std::vector<float> out_data;
+out_data.resize(out_num);
+output_t->CopyToCpu(out_data.data());
+
+// 释放中间Tensor
+predictor->ClearIntermediateTensor();
+
+// 释放内存池中的所有临时 Tensor
+predictor->TryShrinkMemory();
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/PredictorPool.md b/docs/api_reference/cxx_api_doc/PredictorPool.md
new file mode 100644
index 0000000000000..ea222aa814848
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/PredictorPool.md
@@ -0,0 +1,32 @@
+#  PredictorPool 类
+
+`PredictorPool` 对 `Predictor` 进行了简单的封装，通过传入config和thread的数目来完成初始化，在每个线程中，根据自己的线程id直接从池中取出对应的 `Predictor` 来完成预测过程。
+
+构造函数和API定义如下：
+
+```c++
+// PredictorPool构造函数
+// 参数：config - Config 对象
+//      size - Predictor 对象数量
+PredictorPool(const Config& config, size_t size = 1);
+
+// 根据线程 ID 取出该线程对应的 Predictor
+// 参数：idx - 线程 ID
+// 返回：Predictor* - 线程 ID 对应的 Predictor 指针
+Predictor* Retrive(size_t idx);
+```
+
+代码示例：
+
+```c++
+// 构造 Config 对象
+paddle_infer::Config config(FLAGS_infer_model);
+// 启用 GPU 预测
+config.EnableUseGpu(100, 0);
+
+// 根据 Config 对象创建 PredictorPool
+paddle_infer::services::PredictorPool pred_pool(config, 4);
+
+// 获取 ID 为 2 的 Predictor 对象
+auto pred = pred_pool.Retrive(2);
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_doc/Tensor.md b/docs/api_reference/cxx_api_doc/Tensor.md
new file mode 100644
index 0000000000000..e5d02ede1ca81
--- /dev/null
+++ b/docs/api_reference/cxx_api_doc/Tensor.md
@@ -0,0 +1,115 @@
+#  Tensor 类
+
+Tensor 是 Paddle Inference 的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置 Shape、数据、LoD 信息等。
+
+**注意：** 应使用 `Predictor` 的 `GetInputHandle` 和 `GetOutputHandle` 接口获取输入输出 `Tensor`。
+
+Tensor 类的API定义如下：
+
+```c++
+// 设置 Tensor 的维度信息
+// 参数：shape - 维度信息
+// 返回：None
+void Reshape(const std::vector<int>& shape);
+
+// 从 CPU 获取数据，设置到 Tensor 内部
+// 参数：data - CPU 数据指针
+// 返回：None
+template <typename T>
+void CopyFromCpu(const T* data);
+
+// 从 Tensor 中获取数据到 CPU
+// 参数：data - CPU 数据指针
+// 返回：None
+template <typename T>
+void CopyToCpu(T* data);
+
+// 获取 Tensor 底层数据指针，用于设置 Tensor 输入数据
+// 在调用这个 API 之前需要先对输入 Tensor 进行 Reshape
+// 参数：place - 获取 Tensor 的 PlaceType
+// 返回：数据指针
+template <typename T>
+T* mutable_data(PlaceType place);
+
+// 获取 Tensor 底层数据的常量指针，用于读取 Tensor 输出数据
+// 参数：place - 获取 Tensor 的 PlaceType
+//      size - 获取 Tensor 的 size
+// 返回：数据指针
+template <typename T>
+T* data(PlaceType* place, int* size) const;
+
+// 设置 Tensor 的 LoD 信息
+// 参数：x - Tensor 的 LoD 信息
+// 返回：None
+void SetLoD(const std::vector<std::vector<size_t>>& x);
+
+// 获取 Tensor 的 LoD 信息
+// 参数：None
+// 返回：std::vector<std::vector<size_t>> - Tensor 的 LoD 信息
+std::vector<std::vector<size_t>> lod() const;
+
+// 获取 Tensor 的 DataType
+// 参数：None
+// 返回：DataType - Tensor 的 DataType
+DataType type() const;
+
+// 获取 Tensor 的维度信息
+// 参数：None
+// 返回：std::vector<int> - Tensor 的维度信息
+std::vector<int> shape() const;
+
+// 获取 Tensor 的 Name
+// 参数：None
+// 返回：std::string& - Tensor 的 Name
+const std::string& name() const;
+```
+
+代码示例：
+
+```c++
+// 构造 Config 对象
+paddle_infer::Config config(FLAGS_infer_model);
+
+// 创建 Predictor
+auto predictor = paddle_infer::CreatePredictor(config);
+
+// 准备输入数据
+int input_num = shape_production(INPUT_SHAPE);
+std::vector<float> input_data(input_num, 1);
+
+// 获取输入 Tensor
+auto input_names = predictor->GetInputNames();
+auto input_tensor = predictor->GetInputHandle(input_names[0]);
+
+// 设置输入 Tensor 的维度信息
+input_tensor->Reshape(INPUT_SHAPE);
+// 获取输入 Tensor 的 Name
+auto name = input_tensor->name();
+
+//  方式1: 通过 mutable_data 设置输入数据
+std::copy_n(input_data.begin(), input_data.size(),
+            input_tensor->mutable_data<float>(paddle_infer::PlaceType::kCPU));
+
+//  方式2: 通过 CopyFromCpu 设置输入数据
+input_tensor->CopyFromCpu(input_data.data());
+
+// 执行预测
+predictor->Run();
+
+// 获取 Output Tensor
+auto output_names = predictor->GetOutputNames();
+auto output_tensor = predictor->GetOutputHandle(output_names[0]);
+
+// 获取 Output Tensor 的维度信息
+std::vector<int> output_shape = output_tensor->shape();
+
+// 方式1: 通过 data 获取 Output Tensor 的数据
+paddle_infer::PlaceType place;
+int size = 0;
+auto* out_data = output_tensor->data<float>(&place, &size);
+
+// 方式2: 通过 CopyToCpu 获取 Output Tensor 的数据
+std::vector<float> output_data(std::accumulate(output_shape.begin(), output_shape.end(),
+                                               1, std::multiplies<int>()));
+output_tensor->CopyToCpu(output_data.data());
+```
\ No newline at end of file
diff --git a/docs/api_reference/cxx_api_index.rst b/docs/api_reference/cxx_api_index.rst
new file mode 100644
index 0000000000000..1a44cc0b16b17
--- /dev/null
+++ b/docs/api_reference/cxx_api_index.rst
@@ -0,0 +1,14 @@
+C++ API 文档
+================
+
+.. toctree::
+    :maxdepth: 3
+
+    cxx_api_doc/CreatePredictor
+    cxx_api_doc/Config_index.rst
+    cxx_api_doc/PaddlePassBuilder
+    cxx_api_doc/Predictor
+    cxx_api_doc/PredictorPool
+    cxx_api_doc/Tensor
+    cxx_api_doc/Enum
+    
diff --git a/docs/api_reference/python_api_doc/Config/CPUConfig.md b/docs/api_reference/python_api_doc/Config/CPUConfig.md
new file mode 100644
index 0000000000000..71d31a887ef55
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/CPUConfig.md
@@ -0,0 +1,121 @@
+# 使用 CPU 进行预测
+
+**注意：**
+1. 在 CPU 型号允许的情况下，进行预测库下载或编译时尽量使用带 AVX 和 MKL 的版本
+2. 可以尝试使用 Intel 的 MKLDNN 进行 CPU 预测加速，默认 CPU 不启用 MKLDNN
+3. 在 CPU 可用核心数足够时，可以通过设置 `set_cpu_math_library_num_threads` 将线程数调高一些，默认线程数为 1
+
+## CPU 设置
+
+API定义如下：
+
+```python
+# 设置 CPU Blas 库计算线程数
+# 参数：cpu_math_library_num_threads - blas库计算线程数
+# 返回：None
+paddle.inference.Config.set_cpu_math_library_num_threads(cpu_math_library_num_threads: int)
+
+# 获取 CPU Blas 库计算线程数
+# 参数：None
+# 返回：int - cpu blas库计算线程数
+paddle.inference.Config.cpu_math_library_num_threads()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config()
+
+# 设置 CPU Blas 库线程数为 10
+config.set_cpu_math_library_num_threads(10)
+
+# 通过 API 获取 CPU 信息 - 10
+print(config.cpu_math_library_num_threads())
+```
+
+## MKLDNN 设置
+
+**注意：** 
+1. 启用 MKLDNN 的前提为已经使用 CPU 进行预测，否则启用 MKLDNN 无法生效
+2. 启用 MKLDNN BF16 要求 CPU 型号可以支持 AVX512，否则无法启用 MKLDNN BF16
+3. `set_mkldnn_cache_capacity` 请参考 <a class="reference external" href="https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md">MKLDNN cache设计文档</a>
+
+API定义如下：
+
+```python
+# 启用 MKLDNN 进行预测加速
+# 参数：None
+# 返回：None
+paddle.inference.Config.enable_mkldnn()
+
+# 判断是否启用 MKLDNN 
+# 参数：None
+# 返回：bool - 是否启用 MKLDNN
+paddle.inference.Config.mkldnn_enabled()
+
+# 设置 MKLDNN 针对不同输入 shape 的 cache 容量大小
+# 参数：int - cache 容量大小
+# 返回：None
+paddle.inference.Config.set_mkldnn_cache_capacity(capacity: int=0)
+
+# 指定使用 MKLDNN 加速的 OP 集合
+# 参数：使用 MKLDNN 加速的 OP 集合
+# 返回：None
+paddle.inference.Config.set_mkldnn_op(op_list: Set[str])
+
+# 启用 MKLDNN BFLOAT16
+# 参数：None
+# 返回：None
+paddle.inference.Config.enable_mkldnn_bfloat16()
+
+
+# 指定使用 MKLDNN BFLOAT16 加速的 OP 集合
+# 参数：使用 MKLDNN BFLOAT16 加速的 OP 集合
+# 返回：None
+paddle.inference.Config.set_bfloat16_op(op_list: Set[str])
+```
+
+代码示例 (1)：使用 MKLDNN 进行预测
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 MKLDNN 进行预测
+config.enable_mkldnn()
+
+# 通过 API 获取 MKLDNN 启用结果 - true
+print(config.mkldnn_enabled())
+
+# 设置 MKLDNN 的 cache 容量大小
+config.set_mkldnn_cache_capacity(1)
+
+# 设置启用 MKLDNN 进行加速的 OP 列表
+config.set_mkldnn_op({"softmax", "elementwise_add", "relu"})
+```
+
+代码示例 (2)：使用 MKLDNN BFLOAT16 进行预测
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 MKLDNN 进行预测
+config.enable_mkldnn()
+
+# 启用 MKLDNN BFLOAT16 进行预测
+config.enable_mkldnn_bfloat16()
+
+# 设置启用 MKLDNN BFLOAT16 的 OP 列表
+config.set_bfloat16_op({"conv2d"})
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Config/ConfigClass.md b/docs/api_reference/python_api_doc/Config/ConfigClass.md
new file mode 100644
index 0000000000000..ae8de85463c92
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/ConfigClass.md
@@ -0,0 +1,61 @@
+# Config 类定义
+
+`Config` 类用于设置构建 `Predictor` 对象所需的配置信息，如模型路径、是否开启 GPU 等。
+
+构造函数定义如下：
+
+```python
+# Config 类定义，输入为 None
+class paddle.inference.Config()
+
+# Config 类定义，输入为其他 Config 对象
+class paddle.inference.Config(config: Config)
+
+# Config 类定义，输入为非 Combine 模型的文件夹路径
+class paddle.inference.Config(model_dir: str)
+
+# Config 类定义，输入分别为 Combine 模型的模型文件路径和参数文件路径
+class paddle.inference.Config(prog_file: str, params_file: str)
+```
+
+代码示例 (1)：加载预测模型 - 非Combined模型
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config()
+
+# 加载非Combined模型
+config.set_model("./mobilenet_v1")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
+
+代码示例 (2)：加载预测模型 - 非Combined模型
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
+
+代码示例 (3)：加载预测模型 - Combined模型
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v2/__model__", "./mobilenet_v2/__params__")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
diff --git a/docs/api_reference/python_api_doc/Config/GPUConfig.md b/docs/api_reference/python_api_doc/Config/GPUConfig.md
new file mode 100644
index 0000000000000..9497015496235
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/GPUConfig.md
@@ -0,0 +1,209 @@
+# 使用 GPU 进行预测
+
+**注意：**
+1. Config 默认使用 CPU 进行预测，需要通过 `enable_use_gpu` 来启用 GPU 预测
+2. 可以尝试启用 CUDNN 和 TensorRT 进行 GPU 预测加速
+
+## GPU 设置
+
+API定义如下：
+
+```python
+# 启用 GPU 进行预测
+# 参数：memory_pool_init_size_mb - 初始化分配的gpu显存，以MB为单位
+#      device_id - 设备id
+# 返回：None
+paddle.inference.Config.enable_use_gpu(memory_pool_init_size_mb: int, device_id: int)
+
+# 禁用 GPU 进行预测
+# 参数：None
+# 返回：None
+paddle.inference.Config.disable_gpu()
+
+# 判断是否启用 GPU 
+# 参数：None
+# 返回：bool - 是否启用 GPU 
+paddle.inference.Config.use_gpu()
+
+# 获取 GPU 的device id
+# 参数：None
+# 返回：int -  GPU 的device id
+paddle.inference.Config.gpu_device_id()
+
+# 获取 GPU 的初始显存大小
+# 参数：None
+# 返回：int -  GPU 的初始的显存大小
+paddle.inference.Config.memory_pool_init_size_mb()
+
+# 初始化显存占总显存的百分比
+# 参数：None
+# 返回：float - 初始的显存占总显存的百分比
+paddle.inference.Config.fraction_of_gpu_memory_for_pool()
+```
+
+GPU设置代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU 进行预测 - 初始化 GPU 显存 100M, Device_ID 为 0
+config.enable_use_gpu(100, 0)
+# 通过 API 获取 GPU 信息
+print("Use GPU is: {}".format(config.use_gpu())) # True
+print("Init mem size is: {}".format(config.memory_pool_init_size_mb())) # 100
+print("Init mem frac is: {}".format(config.fraction_of_gpu_memory_for_pool())) # 0.003
+print("GPU device id is: {}".format(config.gpu_device_id())) # 0
+
+# 禁用 GPU 进行预测
+config.disable_gpu()
+# 通过 API 获取 GPU 信息
+print("Use GPU is: {}".format(config.use_gpu())) # False
+```
+
+## TensorRT 设置
+
+**注意：** 
+1. 启用 TensorRT 的前提为已经启用 GPU，否则启用 TensorRT 无法生效
+2. 对存在LoD信息的模型，如Bert, Ernie等NLP模型，必须使用动态 Shape
+3. 启用 TensorRT OSS 可以支持更多 plugin，详细参考 [TensorRT OSS](https://news.developer.nvidia.com/nvidia-open-sources-parsers-and-plugins-in-tensorrt/)
+
+更多 TensorRT 详细信息，请参考 [使用Paddle-TensorRT库预测](../../../optimize/paddle_trt)。
+
+API定义如下：
+
+```python
+# 启用 TensorRT 进行预测加速
+# 参数：workspace_size     - 指定 TensorRT 使用的工作空间大小
+#      max_batch_size     - 设置最大的 batch 大小，运行时 batch 大小不得超过此限定值
+#      min_subgraph_size  - Paddle-TRT 是以子图的形式运行，为了避免性能损失，当子图内部节点个数
+#                           大于 min_subgraph_size 的时候，才会使用 Paddle-TRT 运行
+#      precision_mode     - 指定使用 TRT 的精度，支持 FP32(PrecisionType.Float32)，FP16(PrecisionType.Half)，Int8(PrecisionType.Int8)
+#      use_static         - 若指定为 true，在初次运行程序的时候会将 TRT 的优化信息进行序列化到磁盘上，
+#                           下次运行时直接加载优化的序列化信息而不需要重新生成
+#      use_calib_mode     - 若要运行 Paddle-TRT INT8 离线量化校准，需要将此选项设置为 true
+# 返回：None
+paddle.inference.Config.enable_tensorrt_engine(workspace_size: int = 1 << 20,
+                                               max_batch_size: int = 1,
+                                               min_subgraph_size: int = 3,
+                                               precision_mode: PrecisionType = PrecisionType.Float32,
+                                               use_static: bool = False,
+                                               use_calib_mode: bool = True)
+
+# 判断是否启用 TensorRT 
+# 参数：None
+# 返回：bool - 是否启用 TensorRT
+paddle.inference.Config.tensorrt_engine_enabled()
+
+# 设置 TensorRT 的动态 Shape
+# 参数：min_input_shape          - TensorRT 子图支持动态 shape 的最小 shape
+#      max_input_shape          - TensorRT 子图支持动态 shape 的最大 shape
+#      optim_input_shape        - TensorRT 子图支持动态 shape 的最优 shape
+#      disable_trt_plugin_fp16  - 设置 TensorRT 的 plugin 不在 fp16 精度下运行
+# 返回：None
+paddle.inference.Config.set_trt_dynamic_shape_info(min_input_shape: Dict[str, List[int]]={}, 
+                                                   max_input_shape: Dict[str, List[int]]={}, 
+                                                   optim_input_shape: Dict[str, List[int]]={}, 
+                                                   disable_trt_plugin_fp16: bool=False)
+
+# 启用 TensorRT OSS 进行预测加速
+# 参数：None
+# 返回：None
+paddle.inference.Config.enable_tensorrt_oss()
+
+# 判断是否启用 TensorRT OSS
+# 参数：None
+# 返回：bool - 是否启用 TensorRT OSS
+paddle.inference.Config.tensorrt_oss_enabled()
+```
+
+代码示例 (1)：使用 TensorRT FP32 / FP16 / INT8 进行预测
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU 进行预测 - 初始化 GPU 显存 100M, Device_ID 为 0
+config.enable_use_gpu(100, 0)
+
+# 启用 TensorRT 进行预测加速 - FP32
+config.enable_tensorrt_engine(workspace_size = 1 << 20, 
+                              max_batch_size = 1, 
+                              min_subgraph_size = 3, 
+                              precision_mode=paddle_infer.PrecisionType.Float32, 
+                              use_static = False, use_calib_mode = False)
+# 通过 API 获取 TensorRT 启用结果 - true
+print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
+
+
+# 启用 TensorRT 进行预测加速 - FP16
+config.enable_tensorrt_engine(workspace_size = 1 << 20, 
+                              max_batch_size = 1, 
+                              min_subgraph_size = 3, 
+                              precision_mode=paddle_infer.PrecisionType.Half, 
+                              use_static = False, use_calib_mode = False)
+# 通过 API 获取 TensorRT 启用结果 - true
+print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
+
+# 启用 TensorRT 进行预测加速 - Int8
+config.enable_tensorrt_engine(workspace_size = 1 << 20, 
+                              max_batch_size = 1, 
+                              min_subgraph_size = 3, 
+                              precision_mode=paddle_infer.PrecisionType.Int8, 
+                              use_static = False, use_calib_mode = False)
+# 通过 API 获取 TensorRT 启用结果 - true
+print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
+```
+
+代码示例 (2)：使用 TensorRT 动态 Shape 进行预测
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU 进行预测 - 初始化 GPU 显存 100M, Device_ID 为 0
+config.enable_use_gpu(100, 0)
+
+# 启用 TensorRT 进行预测加速 - Int8
+config.enable_tensorrt_engine(workspace_size = 1 << 30, 
+                              max_batch_size = 1, 
+                              min_subgraph_size = 1, 
+                              precision_mode=paddle_infer.PrecisionType.Int8, 
+                              use_static = False, use_calib_mode = True)
+
+# 设置 TensorRT 的动态 Shape
+config.set_trt_dynamic_shape_info(min_input_shape={"image": [1, 1, 3, 3]},
+                                  max_input_shape={"image": [1, 1, 10, 10]},
+                                  optim_input_shape={"image": [1, 1, 3, 3]})
+```
+
+代码示例 (3)：使用 TensorRT OSS 进行预测
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU 进行预测 - 初始化 GPU 显存 100M, Device_ID 为 0
+config.enable_use_gpu(100, 0)
+
+# 启用 TensorRT 进行预测加速
+config.enable_tensorrt_engine()
+
+# 启用 TensorRT OSS 进行预测加速
+config.enable_tensorrt_oss()
+
+# 通过 API 获取 TensorRT OSS 启用结果 - true
+print("Enable TensorRT OSS is: {}".format(config.tensorrt_oss_enabled()))
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Config/InternalUse.md b/docs/api_reference/python_api_doc/Config/InternalUse.md
new file mode 100644
index 0000000000000..78d17ffa5e6ee
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/InternalUse.md
@@ -0,0 +1,54 @@
+# 仅供内部使用
+
+API定义如下：
+
+```python
+# 转化为 NativeConfig，不推荐使用
+# 参数：None
+# 返回：当前 Config 对应的 NativeConfig
+paddle.inference.Config.to_native_config()
+
+# 设置是否使用Feed, Fetch OP，仅内部使用
+# 当使用 ZeroCopyTensor 时，需设置为 false
+# 参数：x - 是否使用Feed, Fetch OP，默认为 true
+# 返回：None
+paddle.inference.Config.switch_use_feed_fetch_ops(x: bool = True)
+
+# 判断是否使用Feed, Fetch OP
+# 参数：None
+# 返回：bool - 是否使用Feed, Fetch OP
+paddle.inference.Config.use_feed_fetch_ops_enabled()
+
+# 设置是否需要指定输入 Tensor 的 Name，仅对内部 ZeroCopyTensor 有效
+# 参数：x - 是否指定输入 Tensor 的 Name，默认为 true
+# 返回：None
+paddle.inference.Config.switch_specify_input_names(x: bool = True)
+
+# 判断是否需要指定输入 Tensor 的 Name，仅对内部 ZeroCopyTensor 有效
+# 参数：None
+# 返回：bool - 是否需要指定输入 Tensor 的 Name
+paddle.inference.Config.specify_input_name()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 转化为 NativeConfig
+native_config = config.to_native_config()
+
+# 禁用 Feed, Fetch OP
+config.switch_use_feed_fetch_ops(False)
+# 返回是否使用 Feed, Fetch OP - false
+print("switch_use_feed_fetch_ops is: {}".format(config.use_feed_fetch_ops_enabled()))
+
+# 设置需要指定输入 Tensor 的 Name
+config.switch_specify_input_names(True)
+# 返回是否需要指定输入 Tensor 的 Name - true
+print("specify_input_name is: {}".format(config.specify_input_name()))
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Config/ModelConfig.md b/docs/api_reference/python_api_doc/Config/ModelConfig.md
new file mode 100644
index 0000000000000..e1418e8bc57e5
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/ModelConfig.md
@@ -0,0 +1,135 @@
+# 设置预测模型
+
+## 从文件中加载预测模型 - 非Combined模型 
+
+API定义如下：
+
+```python
+# 设置模型文件路径，当需要从磁盘加载非 Combined 模型时使用
+# 参数：model_dir - 模型文件夹路径 - str 类型
+# 返回：None
+paddle.inference.Config.set_model(model_dir: str)
+
+# 获取非combine模型的文件夹路径
+# 参数：None
+# 返回：str - 模型文件夹路径
+paddle.inference.Config.model_dir()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config()
+
+# 设置非combine模型的文件夹路径
+config.set_model("./mobilenet_v1")
+
+# 获取非combine模型的文件夹路径
+print(config.model_dir())
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
+
+## 从文件中加载预测模型 -  Combined 模型
+
+API定义如下：
+
+```python
+# 设置模型文件路径，当需要从磁盘加载 Combined 模型时使用
+# 参数：prog_file_path - 模型文件路径
+#      params_file_path - 参数文件路径
+# 返回：None
+paddle.inference.Config.set_model(prog_file_path: str, params_file_path: str)
+
+# 设置模型文件路径，当需要从磁盘加载 Combined 模型时使用。
+# 参数：x - 模型文件路径
+# 返回：None
+paddle.inference.Config.set_prog_file(x: str)
+
+# 设置参数文件路径，当需要从磁盘加载 Combined 模型时使用
+# 参数：x - 参数文件路径
+# 返回：None
+paddle.inference.Config.set_params_file(x: str)
+
+# 获取 Combined 模型的模型文件路径
+# 参数：None
+# 返回：str - 模型文件路径
+paddle.inference.Config.prog_file()
+
+# 获取 Combined 模型的参数文件路径
+# 参数：None
+# 返回：str - 参数文件路径
+paddle.inference.Config.params_file()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config()
+
+# 通过 API 设置模型文件和参数文件路径
+config.set_prog_file("./mobilenet_v2/__model__")
+config.set_params_file("./mobilenet_v2/__params__")
+
+# 通过 API 获取 config 中的模型文件和参数文件路径
+print(config.prog_file())
+print(config.params_file())
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
+
+## 从内存中加载预测模型
+
+API定义如下：
+
+```python
+# 从内存加载模型
+# 参数：prog_buffer - 内存中模型结构数据
+#      prog_buffer_size - 内存中模型结构数据的大小
+#      params_buffer - 内存中模型参数数据
+#      params_buffer_size - 内存中模型参数数据的大小
+# 返回：None
+paddle.inference.Config.set_model_buffer(prog_buffer: str, prog_buffer_size: int, 
+                                         params_buffer: str, params_buffer_size: int)
+
+# 判断是否从内存中加载模型
+# 参数：None
+# 返回：bool - 是否从内存中加载模型
+paddle.inference.Config.model_from_memory()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config()
+
+# 加载模型文件到内存
+with open('./mobilenet_v2/__model__', 'rb') as prog_file:
+    prog_data=prog_file.read()
+    
+with open('./mobilenet_v2/__params__', 'rb') as params_file:
+    params_data=params_file.read()
+
+# 从内存中加载模型
+config.set_model_buffer(prog_data, len(prog_data), params_data, len(params_data))
+
+# 通过 API 获取 config 中 model_from_memory 的值 - True
+print(config.model_from_memory())
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
diff --git a/docs/api_reference/python_api_doc/Config/OptimConfig.md b/docs/api_reference/python_api_doc/Config/OptimConfig.md
new file mode 100644
index 0000000000000..8817c8d1df90b
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/OptimConfig.md
@@ -0,0 +1,124 @@
+# 设置模型优化方法
+
+## IR 优化
+
+API定义如下：
+
+```python
+# 启用 IR 优化
+# 参数：x - 是否开启 IR 优化，默认打开
+# 返回：None
+paddle.inference.Config.switch_ir_optim(x: bool = True)
+
+# 判断是否开启 IR 优化 
+# 参数：None
+# 返回：bool - 是否开启 IR 优化
+paddle.inference.Config.ir_optim()
+
+# 设置是否在图分析阶段打印 IR，启用后会在每一个 PASS 后生成 dot 文件
+# 参数：x - 是否打印 IR，默认打开
+# 返回：None
+paddle.inference.Config.switch_ir_debug(x: bool = True)
+
+# 返回 pass_builder，用来自定义图分析阶段选择的 IR
+# 参数：None
+# 返回：PassStrategy - pass_builder对象
+paddle.inference.Config.pass_builder()
+
+# 删除字符串匹配为 pass 的 pass
+# 参数：pass - 需要删除的 pass 字符串
+# 返回：None
+paddle.inference.Config.delete_pass(pass: str)
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 开启 IR 优化
+config.switch_ir_optim()
+# 开启 IR 打印
+config.switch_ir_debug()
+
+# 得到 pass_builder 对象
+pass_builder = config.pass_builder()
+
+# 或者直接通过 config 去除 fc_fuse_pass
+config.delete_pass("fc_fuse_pass")
+
+# 通过 API 获取 IR 优化是否开启 - true
+print("IR Optim is: {}".format(config.ir_optim()))
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+
+
+```
+
+运行结果示例：
+
+```bash
+# switch_ir_optim 开启 IR 优化后，运行中会有如下 LOG 输出
+--- Running analysis [ir_graph_build_pass]
+--- Running analysis [ir_graph_clean_pass]
+--- Running analysis [ir_analysis_pass]
+--- Running IR pass [simplify_with_basic_ops_pass]
+--- Running IR pass [attention_lstm_fuse_pass]
+--- Running IR pass [seqconv_eltadd_relu_fuse_pass]
+...
+--- Running analysis [inference_op_replace_pass]
+--- Running analysis [ir_graph_to_program_pass]
+
+# switch_ir_debug 开启 IR 打印后，运行结束之后会在目录下生成如下 DOT 文件
+-rw-r--r-- 1 root root  70K Nov 17 10:47 0_ir_simplify_with_basic_ops_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 10_ir_fc_gru_fuse_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 11_ir_graph_viz_pass.dot
+...
+-rw-r--r-- 1 root root  72K Nov 17 10:47 8_ir_mul_lstm_fuse_pass.dot
+-rw-r--r-- 1 root root  72K Nov 17 10:47 9_ir_graph_viz_pass.dot
+```
+
+## Lite 子图
+
+```python 
+# 启用 Lite 子图
+# 参数：precision_mode - Lite 子图的运行精度，默认为 FP32
+#      zero_copy      - 启用 zero_copy，lite 子图与 paddle inference 之间共享数据
+#      passes_filter  - 设置 lite 子图的 pass
+#      ops_filter     - 设置不使用 lite 子图运行的 op
+# 返回：None
+paddle.inference.Config.enable_lite_engine(precision_mode: PrecisionType = paddle_infer.PrecisionType.Float32, 
+                                           zero_copy: bool = False, 
+                                           passes_filter: List[str]=[], 
+                                           ops_filter: List[str]=[])
+
+
+# 判断是否启用 Lite 子图
+# 参数：None
+# 返回：bool - 是否启用 Lite 子图
+paddle.inference.Config.lite_engine_enabled()
+```
+
+示例代码：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU 进行预测
+config.enable_use_gpu(100, 0)
+
+# 启用 Lite 子图
+config.enable_lite_engine(paddle_infer.PrecisionType.Float32)
+
+# 通过 API 获取 Lite 子图启用信息 - true
+print("Lite Engine is: {}".format(config.lite_engine_enabled()))
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Config/OtherFunction.md b/docs/api_reference/python_api_doc/Config/OtherFunction.md
new file mode 100644
index 0000000000000..464897e9cb8b4
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/OtherFunction.md
@@ -0,0 +1,144 @@
+# 启用内存优化
+
+API定义如下：
+
+```python
+# 开启内/显存复用，具体降低内存效果取决于模型结构。
+# 参数：None
+# 返回：None
+paddle.inference.Config.enable_memory_optim()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 开启 CPU 内存优化
+config.enable_memory_optim()
+
+# 启用 GPU 进行预测
+config.enable_use_gpu(100, 0)
+# 开启 GPU 显存优化
+config.enable_memory_optim()
+```
+
+# 设置缓存路径
+
+**注意：** 如果当前使用的为 TensorRT INT8 且设置从内存中加载模型，则必须通过 `set_optim_cache_dir` 来设置缓存路径。
+
+API定义如下：
+
+```python
+# 设置缓存路径
+# 参数：opt_cache_dir - 缓存路径
+# 返回：None
+paddle.inference.Config.set_optim_cache_dir(opt_cache_dir: str)
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 设置缓存路径
+config.set_optim_cache_dir("./OptimCacheDir")
+```
+
+# Profile 设置
+
+API定义如下：
+
+```python
+# 打开 Profile，运行结束后会打印所有 OP 的耗时占比。
+# 参数：None
+# 返回：None
+paddle.inference.Config.enable_profile()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 打开 Profile
+config.enable_profile()
+```
+
+执行预测之后输出的 Profile 的结果如下：
+
+```bash
+------------------------->     Profiling Report     <-------------------------
+
+Place: CPU
+Time unit: ms
+Sorted by total time in descending order in the same thread
+
+-------------------------     Overhead Summary      -------------------------
+
+Total time: 1085.33
+  Computation time       Total: 1066.24     Ratio: 98.2411%
+  Framework overhead     Total: 19.0902     Ratio: 1.75893%
+
+-------------------------     GpuMemCpy Summary     -------------------------
+
+GpuMemcpy                Calls: 0           Total: 0           Ratio: 0%
+
+-------------------------       Event Summary       -------------------------
+
+Event                            Calls       Total       Min.        Max.        Ave.        Ratio.
+thread0::conv2d                  210         319.734     0.815591    6.51648     1.52254     0.294595
+thread0::load                    137         284.596     0.114216    258.715     2.07735     0.26222
+thread0::depthwise_conv2d        195         266.241     0.955945    2.47858     1.36534     0.245308
+thread0::elementwise_add         210         122.969     0.133106    2.15806     0.585568    0.113301
+thread0::relu                    405         56.1807     0.021081    0.585079    0.138718    0.0517635
+thread0::batch_norm              195         25.8073     0.044304    0.33896     0.132345    0.0237783
+thread0::fc                      15          7.13856     0.451674    0.714895    0.475904    0.0065773
+thread0::pool2d                  15          1.48296     0.09054     0.145702    0.0988637   0.00136636
+thread0::softmax                 15          0.941837    0.032175    0.460156    0.0627891   0.000867786
+thread0::scale                   15          0.240771    0.013394    0.030727    0.0160514   0.000221841
+```
+
+# Log 设置
+
+API定义如下：
+
+```python
+# 去除 Paddle Inference 运行中的 LOG
+# 参数：None
+# 返回：None
+paddle.inference.Config.disable_glog_info()
+
+# 判断是否禁用 LOG
+# 参数：None
+# 返回：bool - 是否禁用 LOG
+paddle.inference.Config.glog_info_disabled()
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 去除 Paddle Inference 运行中的 LOG
+config.disable_glog_info()
+
+# 判断是否禁用 LOG - True
+print("GLOG INFO is: {}".format(config.glog_info_disabled()))
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Config/XPUConfig.md b/docs/api_reference/python_api_doc/Config/XPUConfig.md
new file mode 100644
index 0000000000000..4a836df63c8b0
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config/XPUConfig.md
@@ -0,0 +1,24 @@
+
+# 使用 XPU 进行预测
+
+API定义如下：
+
+```python
+# 启用 XPU 进行预测
+# 参数：l3_workspace_size - l3 cache 分配的显存大小
+# 返回：None
+paddle.inference.Config.enable_xpu(l3_workspace_size: int = 0xfffc00)
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 XPU，并设置 l3 cache 大小为 100M
+config.enable_xpu(100)
+```
diff --git a/docs/api_reference/python_api_doc/Config_index.rst b/docs/api_reference/python_api_doc/Config_index.rst
new file mode 100644
index 0000000000000..977945ad8ed91
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Config_index.rst
@@ -0,0 +1,15 @@
+Config 类
+================
+
+.. toctree::
+    :maxdepth: 2
+    :numbered: 3
+
+    Config/ConfigClass
+    Config/ModelConfig
+    Config/CPUConfig
+    Config/GPUConfig
+    Config/XPUConfig
+    Config/OptimConfig
+    Config/OtherFunction
+    Config/InternalUse
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Enum.md b/docs/api_reference/python_api_doc/Enum.md
new file mode 100644
index 0000000000000..c59d1184b88a9
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Enum.md
@@ -0,0 +1,66 @@
+# 枚举类型
+
+## DataType
+
+`DataType`定义了`Tensor`的数据类型，由传入`Tensor`的numpy数组类型确定。
+
+```python
+# DataType 枚举定义
+class paddle.inference.DataType:
+
+# 获取各个 DataType 对应的字节数
+# 参数：dtype - DataType 枚举
+# 返回：dtype 对应的字节数
+paddle.inference.get_num_bytes_of_data_type(dtype: DataType)
+```
+
+DataType 中包括以下成员:
+
+* `INT64`: 64位整型
+* `INT32`: 32位整型
+* `FLOAT32`: 32位浮点型
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 FLOAT32 类型 DataType
+data_type = paddle_infer.DataType.FLOAT32
+
+# 输出 data_type 的字节数 - 4
+paddle_infer.get_num_bytes_of_data_type(data_type)
+```
+
+## PrecisionType
+
+`PrecisionType` 设置模型的运行精度，默认值为 `Float32`（FP32）。枚举变量定义如下：
+
+```python
+# PrecisionType 枚举定义
+class paddle.inference.PrecisionType
+```
+
+PrecisionType 中包括以下成员:
+
+* `Float32`: FP32 模式运行
+* `Half`: FP16 模式运行
+* `Int8`: INT8 模式运行
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 启用 GPU, 初始化100M显存，使用gpu id为0
+config.enable_use_gpu(100, 0)
+
+# 开启 TensorRT 预测，精度为 FP32，开启 INT8 离线量化校准
+config.enable_tensorrt_engine(precision_mode=paddle_infer.PrecisionType.Float32,
+                              use_calib_mode=True)
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Predictor.md b/docs/api_reference/python_api_doc/Predictor.md
new file mode 100644
index 0000000000000..1b0cd80e68fa3
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Predictor.md
@@ -0,0 +1,86 @@
+# Predictor 类
+
+Paddle Inference的预测器，由 `create_predictor` 根据 `Config` 进行创建。用户可以根据Predictor提供的接口设置输入数据、执行模型预测、获取输出等。
+
+类及方法定义如下：
+
+```python
+# Predictor 类定义
+class paddle.inference.Predictor
+
+# 获取所有输入 Tensor 的名称
+# 参数：None
+# 返回：List[str] - 所有输入 Tensor 的名称
+paddle.inference.Predictor.get_input_names()
+
+# 根据名称获取输入 Tensor 的句柄
+# 参数：name - Tensor 的名称
+# 返回：Tensor - 输入 Tensor
+paddle.inference.Predictor.get_input_handle(name: str)
+
+# 获取所有输出 Tensor 的名称
+# 参数：None
+# 返回：List[str] - 所有输出 Tensor 的名称
+paddle.inference.Predictor.get_output_names()
+
+# 根据名称获取输出 Tensor 的句柄
+# 参数：name - Tensor 的名称
+# 返回：Tensor - 输出 Tensor
+paddle.inference.Predictor.get_output_handle(name: str)
+
+# 执行模型预测，需要在设置输入数据后调用
+# 参数：None
+# 返回：None
+paddle.inference.Predictor.run()
+
+# 根据该 Predictor，克隆一个新的 Predictor，两个 Predictor 之间共享权重
+# 参数：None
+# 返回：Predictor - 新的 Predictor
+paddle.inference.Predictor.clone()
+
+# 释放中间 Tensor
+# 参数：None
+# 返回：None
+paddle.inference.Predictor.clear_intermediate_tensor()
+
+# 释放内存池中的所有临时 Tensor
+# 参数：None
+# 返回：int - 释放的内存字节数
+paddle.inference.Predictor.try_shrink_memory()
+```
+
+代码示例：
+
+```python
+import numpy
+
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+
+# 获取输入 Tensor
+input_names = predictor.get_input_names()
+input_tensor = predictor.get_input_handle(input_names[0])
+
+# 从 CPU 获取数据，设置到 Tensor 内部
+fake_input = numpy.random.randn(1, 3, 224, 224).astype("float32")
+input_tensor.copy_from_cpu(fake_input)
+
+# 执行预测
+predictor.run()
+
+# 获取输出 Tensor
+output_names = predictor.get_output_names()
+output_tensor = predictor.get_output_handle(output_names[0])
+
+# 释放中间Tensor
+predictor.clear_intermediate_tensor()
+
+# 释放内存池中的所有临时 Tensor
+predictor.try_shrink_memory()
+```
diff --git a/docs/api_reference/python_api_doc/PredictorPool.md b/docs/api_reference/python_api_doc/PredictorPool.md
new file mode 100644
index 0000000000000..71413ff42a925
--- /dev/null
+++ b/docs/api_reference/python_api_doc/PredictorPool.md
@@ -0,0 +1,33 @@
+#  PredictorPool 类
+
+`PredictorPool` 对 `Predictor` 进行了简单的封装，通过传入config和thread的数目来完成初始化，在每个线程中，根据自己的线程id直接从池中取出对应的 `Predictor` 来完成预测过程。
+
+类及方法定义如下：
+
+```python
+# PredictorPool 类定义
+# 参数：config - Config 类型
+#      size - Predictor 对象数量
+class paddle.inference.PredictorPool(config: Config, size: int)
+
+# 根据线程 ID 取出该线程对应的 Predictor
+# 参数：idx - 线程 ID
+# 返回：Predictor - 线程 ID 对应的 Predictor
+paddle.inference.PredictorPool.retrive(idx: int)
+```
+
+代码示例：
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 Config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 创建 PredictorPool
+pred_pool = paddle_infer.PredictorPool(config, 4)
+
+# 获取 ID 为 2 的 Predictor 对象
+predictor = pred_pool.retrive(2)
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/Tensor.md b/docs/api_reference/python_api_doc/Tensor.md
new file mode 100644
index 0000000000000..ae90b9f09fbde
--- /dev/null
+++ b/docs/api_reference/python_api_doc/Tensor.md
@@ -0,0 +1,91 @@
+#  Tensor 类
+
+Tensor是Paddle Inference的数据组织形式，用于对底层数据进行封装并提供接口对数据进行操作，包括设置Shape、数据、LoD信息等。
+
+**注意：** 应使用 `Predictor` 的 `get_input_handle` 和 `get_output_handle` 接口获取输入输出 `Tensor`。
+
+类及方法定义如下：
+
+```python
+# Tensor 类定义
+class paddle.inference.Tensor
+
+# 设置 Tensor 的维度信息
+# 参数：shape - 维度信息
+# 返回：None
+paddle.inference.Tensor.reshape(shape: numpy.ndarray|List[int])
+
+# 从 CPU 获取数据，设置到 Tensor 内部
+# 参数：data - CPU 数据 - 支持float, int32, int64
+# 返回：None
+paddle.inference.Tensor.copy_from_cpu(data: numpy.ndarray)
+
+# 从 Tensor 中获取数据到 CPU
+# 参数：None
+# 返回：numpy.ndarray - CPU 数据
+paddle.inference.Tensor.copy_to_cpu()
+
+# 获取 Tensor 的维度信息
+# 参数：None
+# 返回：List[int] - Tensor 的维度信息
+paddle.inference.Tensor.shape()
+
+# 设置 Tensor 的 LoD 信息
+# 参数：x - Tensor 的 LoD 信息
+# 返回：None
+paddle.inference.Tensor.set_lod(x: numpy.ndarray|List[List[int]])
+
+# 获取 Tensor 的 LoD 信息
+# 参数：None
+# 返回：List[List[int]] - Tensor 的 LoD 信息
+paddle.inference.Tensor.lod()
+
+# 获取 Tensor 的数据类型
+# 参数：None
+# 返回：DataType - Tensor 的数据类型
+paddle.inference.Tensor.type()
+```
+
+代码示例：
+
+```python
+import numpy
+
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+
+# 准备输入数据
+fake_input = numpy.random.randn(1, 3, 224, 224).astype("float32")
+
+# 获取输入 Tensor
+input_names = predictor.get_input_names()
+input_tensor = predictor.get_input_handle(input_names[0])
+
+# 设置 Tensor 的维度信息
+input_tensor.reshape([1, 3, 224, 224])
+
+# 从 CPU 获取数据，设置到 Tensor 内部
+input_tensor.copy_from_cpu(fake_input)
+
+# 执行预测
+predictor.run()
+
+# 获取输出 Tensor
+output_names = predictor.get_output_names()
+output_tensor = predictor.get_output_handle(output_names[0])
+
+# 从 Tensor 中获取数据到 CPU
+output_data = output_tensor.copy_to_cpu()
+
+# 获取 Tensor 的维度信息
+output_shape = output_tensor.shape()
+
+# 获取 Tensor 的数据类型
+output_type = output_tensor.type()
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_doc/create_predictor.md b/docs/api_reference/python_api_doc/create_predictor.md
new file mode 100644
index 0000000000000..d63d1b0834970
--- /dev/null
+++ b/docs/api_reference/python_api_doc/create_predictor.md
@@ -0,0 +1,49 @@
+# create_predictor 方法
+
+API定义如下：
+
+```python
+# 根据 Config 构建预测执行器 Predictor
+# 参数: config - 用于构建 Predictor 的配置信息
+# 返回: Predictor - 预测执行器
+paddle.inference.create_predictor(config: Config)
+```
+
+代码示例:
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 创建 config
+config = paddle_infer.Config("./mobilenet_v1")
+
+# 根据 config 创建 predictor
+predictor = paddle_infer.create_predictor(config)
+```
+
+# get_version 方法
+
+API定义如下：
+
+```python
+# 获取 Paddle 版本信息
+# 参数: None
+# 返回: str - Paddle 版本信息
+paddle.inference.get_version()
+```
+
+代码示例:
+
+```python
+# 引用 paddle inference 预测库
+import paddle.inference as paddle_infer
+
+# 获取 Paddle 版本信息
+paddle_infer.get_version()
+
+# 获得输出如下:
+# version: 2.0.0-rc0
+# commit: 97227e6
+# branch: HEAD
+```
\ No newline at end of file
diff --git a/docs/api_reference/python_api_index.rst b/docs/api_reference/python_api_index.rst
new file mode 100644
index 0000000000000..4f76e784919f5
--- /dev/null
+++ b/docs/api_reference/python_api_index.rst
@@ -0,0 +1,12 @@
+Python API 文档
+=======================
+
+.. toctree::
+    :maxdepth: 3
+
+    python_api_doc/create_predictor
+    python_api_doc/Config_index.rst
+    python_api_doc/Predictor
+    python_api_doc/PredictorPool
+    python_api_doc/Tensor
+    python_api_doc/Enum
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index b1ea576299a44..323c8d2fec832 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -88,6 +88,12 @@
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
 
+html_context = {
+    'css_files': [
+        '_static/theme_overrides.css',  # enable word wrap in code
+        ],
+     }
+
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
 #
@@ -172,46 +178,3 @@
 
 # A list of files that should not be packed into the epub file.
 epub_exclude_files = ['search.html']
-
-extensions = [
-    # there may be others here already, e.g. 'sphinx.ext.mathjax'
-    'breathe',
-    'exhale'
-]
-
-# Setup the breathe extension
-breathe_projects = {
-    "My Project": "./doxyoutput/xml"
-}
-breathe_default_project = "My Project"
-
-# Setup the exhale extension
-exhale_args = {
-    # These arguments are required
-    "containmentFolder":     "./api",
-    "rootFileName":          "library_root.rst",
-    "rootFileTitle":         "Library API",
-    "doxygenStripFromPath":  "..",
-    # Suggested optional arguments
-    "createTreeView":        True,
-    # TIP: if using the sphinx-bootstrap-theme, you need
-    # "treeViewIsBootstrap": True,
-    "exhaleExecutesDoxygen": True,
-    "exhaleDoxygenStdin":    "INPUT = paddle_include_file"
-}
-
-# Tell sphinx what the primary language being documented is.
-primary_domain = 'cpp'
-
-# Tell sphinx what the pygments highlight language should be.
-highlight_language = 'cpp'
-
-import os
-
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
-
-if not on_rtd:  # only import and set the theme if we're building docs locally
-    import sphinx_rtd_theme
-    html_theme = 'sphinx_rtd_theme'
-    #html_theme = "alabaster"
-    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
diff --git a/docs/index.rst b/docs/index.rst
index 30af359119269..3b1f4e1a44c4f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -45,12 +45,14 @@ Welcome to Paddle-Inference's documentation!
   :name: sec-benchmark
   
   benchmark/benchmark
-  
+
 .. toctree::
-  :maxdepth: 2
-  :caption: API文档
+  :maxdepth: 1
+  :caption: API 文档
+  :name: sec-api-reference
 
-  api/library_root
+  api_reference/cxx_api_index
+  api_reference/python_api_index
 
 .. toctree::
   :maxdepth: 1
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 48b25191d7990..d3bc7dd101b9b 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,4 @@
-breathe==4.18.1
 sphinx==3.0.3
 recommonmark
 sphinx_markdown_tables==0.0.14
 sphinx_rtd_theme==0.4.3
-exhale==0.2.3