deepmodeling · AnguseZhang · Jul 5, 2022 · Jun 20, 2022 · Jun 20, 2022 · Jun 20, 2022
diff --git a/doc/index.rst b/doc/index.rst
@@ -34,6 +34,8 @@ DPGEN's documentation
 
    init/init-bulk-mdata
    init/init-surf-mdata
+   init/init-reaction
+   init/init-reaction-jdata
    init/init-reaction-mdata
 
 .. _autotest::

diff --git a/doc/init/init-reaction-jdata.rst b/doc/init/init-reaction-jdata.rst
@@ -0,0 +1,6 @@
+dpgen init_reaction parameters
+======================================
+
+.. dargs::
+   :module: dpgen.data.arginfo
+   :func: init_reaction_jdata_arginfo
diff --git a/doc/init/init-reaction.md b/doc/init/init-reaction.md
@@ -0,0 +1,21 @@
+# init_reaction
+
+`dpgen init_reaction` is a workflow to initialize data for reactive systems of small gas-phase molecules. The workflow was introduced in the "Initialization" section of [Energy & Fuels, 2021, 35 (1), 762–769](https://doi.org/10.1021/acs.energyfuels.0c03211).
+
+To start the workflow, one needs a box containing reactive systems. The following packages are required for each of the steps:
+- Exploring: [LAMMPS](https://github.com/lammps/lammps)
+- Sampling: [MDDatasetBuilder](https://github.com/tongzhugroup/mddatasetbuilder)
+- Labeling: [Gaussian](https://gaussian.com/)
+
+The Exploring step uses LAMMPS [pair_style reaxff](https://docs.lammps.org/latest/pair_reaxff.html) to run a short ReaxFF NVT MD simulation. In the Sampling step, molecular clusters are taken and the k-means clustering algorithm is applied to remove the redundancy, which is described in [Nature Communications, 11, 5713 (2020)](https://doi.org/10.1038/s41467-020-19497-z). The Labeling step calculates energies and forces using the Gaussian package.
+
+An example of `reaction.json` is given below:
+
+```{literalinclude} ../../examples/init/reaction.json
+:language: json
+:linenos:
+```
+
+For detailed parameters, see [parameters](init-reaction-jdata.rst) and [machine parameters](init-reaction-mdata.rst).
+
+The generated data can be used to continue the DP-GEN concurrent learning workflow. Read [Energy & Fuels, 2021, 35 (1), 762–769](https://doi.org/10.1021/acs.energyfuels.0c03211) for details.
diff --git a/dpgen/data/arginfo.py b/dpgen/data/arginfo.py
@@ -1,4 +1,4 @@
-from dargs import Argument
+from dargs import Argument, ArgumentEncoder
 
 from dpgen.arginfo import general_mdata_arginfo
 
@@ -34,3 +34,44 @@ def init_reaction_mdata_arginfo() -> Argument:
         arginfo
     """
     return general_mdata_arginfo("init_reaction_mdata", ("reaxff", "build", "fp"))
+
+
+def init_reaction_jdata_arginfo() -> Argument:
+    """Generate arginfo for dpgen init_reaction jdata.
+
+    Returns
+    -------
+    Argument
+        dpgen init_reaction jdata arginfo
+    """
+    doc_init_reaction = "Generate initial data for reactive systems for small gas-phase molecules, from a ReaxFF NVT MD trajectory."
+    doc_type_map = "Type map, which should match types in the initial data. e.g. [\"C\", \"H\", \"O\"]"
+    doc_reaxff = "Parameters for ReaxFF NVT MD."
+    doc_data = "Path to initial LAMMPS data file. The atom_style should be charge."
+    doc_ff = "Path to ReaxFF force field file. Available in the lammps/potentials directory."
+    doc_control = "Path to ReaxFF control file."
+    doc_temp = "Target Temperature for the NVT MD simulation. Unit: K."
+    doc_dt = "Real time for every time step. Unit: fs."
+    doc_tau_t = "Time to determine how rapidly the temperature is relaxed. Unit: fs."
+    doc_dump_freq = "Frequency of time steps to collect trajectory."
+    doc_nstep = "Total steps to run the ReaxFF MD simulation."
+    doc_cutoff = "Cutoff radius to take clusters from the trajectory. Note that only a complete molecule or free radical will be taken."
+    doc_dataset_size = "Collected dataset size for each bond type."
+    doc_qmkeywords = "Gaussian keywords for first-principle calculations. e.g. force mn15/6-31g** Geom=PrintInputOrient. Note that \"force\" job is necessary to collect data. Geom=PrintInputOrient should be used when there are more than 50 atoms in a cluster."
+    # Real-valued fields list both float and int so integer JSON literals (e.g. 300) are accepted.
+    return Argument("init_reaction_jdata", dict, [
+        Argument("type_map", list, doc=doc_type_map),
+        Argument("reaxff", dict, [
+            Argument("data", str, doc=doc_data),
+            Argument("ff", str, doc=doc_ff),
+            Argument("control", str, doc=doc_control),
+            Argument("temp", [float, int], doc=doc_temp),
+            Argument("dt", [float, int], doc=doc_dt),
+            Argument("tau_t", [float, int], doc=doc_tau_t),
+            Argument("dump_freq", int, doc=doc_dump_freq),
+            Argument("nstep", int, doc=doc_nstep),
+        ], doc=doc_reaxff),
+        Argument("cutoff", [float, int], doc=doc_cutoff),
+        Argument("dataset_size", int, doc=doc_dataset_size),
+        Argument("qmkeywords", str, doc=doc_qmkeywords),
+    ], doc=doc_init_reaction)
diff --git a/dpgen/data/reaction.py b/dpgen/data/reaction.py
@@ -18,7 +18,8 @@
 from dpgen.dispatcher.Dispatcher import make_submission_compat
 from dpgen.remote.decide_machine import convert_mdata
 from dpgen.generator.run import create_path, make_fp_task_name
-from dpgen.util import sepline
+from dpgen.util import sepline, normalize
+from .arginfo import init_reaction_jdata_arginfo
 
 reaxff_path = "00.reaxff"
 build_path = "01.build"
@@ -207,6 +208,9 @@ def gen_init_reaction(args):
             with open(args.MACHINE, "r") as fp:
                 mdata = json.load(fp)
 
+    jdata_arginfo = init_reaction_jdata_arginfo()
+    jdata = normalize(jdata_arginfo, jdata)
+
     mdata = convert_mdata(mdata, ["reaxff", "build", "fp"])
     record = "record.reaction"
     iter_rec = -1

diff --git a/dpgen/util.py b/dpgen/util.py
@@ -3,6 +3,8 @@
 from typing import Union, List
 from pathlib import Path
 
+from dargs import Argument
+
 from dpgen import dlog
 
 """
@@ -47,3 +49,24 @@ def expand_sys_str(root_dir: Union[str, Path]) -> List[str]:
     if (root_dir / "type.raw").is_file():
         matches.append(str(root_dir))
     return matches
+
+def normalize(arginfo: Argument, data: dict, strict_check: bool = True) -> dict:
+    """Normalize input data against an arginfo, then validate the result.
+
+    Parameters
+    ----------
+    arginfo : dargs.Argument
+        argument definition the data is normalized and checked against
+    data : dict
+        raw input data
+    strict_check : bool, default=True
+        forwarded to ``arginfo.check_value`` as ``strict``
+
+    Returns
+    -------
+    dict
+        value returned by ``arginfo.normalize_value`` (``trim_pattern="_*"``)
+    """
+    normalized = arginfo.normalize_value(data, trim_pattern="_*")
+    arginfo.check_value(normalized, strict=strict_check)
+    return normalized
diff --git a/examples/init/reaction.json b/examples/init/reaction.json
@@ -15,5 +15,5 @@
     },
     "cutoff": 3.5,
     "dataset_size": 100,
-    "qmkeywords": "b3lyp/6-31g** force"
+    "qmkeywords": "b3lyp/6-31g** force Geom=PrintInputOrient"
 }
diff --git a/tests/test_check_examples.py b/tests/test_check_examples.py
@@ -0,0 +1,32 @@
+"""This module ensures input in the examples directory
+could pass the argument checking.
+"""
+import unittest
+import json
+from pathlib import Path
+
+from dpgen.util import normalize
+from dpgen.data.arginfo import (
+    init_reaction_jdata_arginfo,
+)
+
+init_reaction_jdata = init_reaction_jdata_arginfo()
+
+# examples directory, located two levels above this test file
+p_examples = Path(__file__).parent.parent / "examples"
+
+# input_files : tuple[tuple[Argument, Path]]
+#   (arginfo, example file) pairs; each file is checked against its arginfo
+input_files = (
+    (init_reaction_jdata, p_examples / "init" / "reaction.json"),
+)
+
+
+class TestExamples(unittest.TestCase):
+    def test_arguments(self):
+        """Load each example in input_files and run normalize on it."""
+        for schema, example in input_files:
+            with self.subTest(fn=str(example)):
+                with open(example) as stream:
+                    payload = json.load(stream)
+                normalize(schema, payload)