Merge pull request #44 from robinzyb/devel

enhance for parsing restart md files
robinzyb · Apr 12, 2024 · b8c241b · b8c241b
2 parents 1120602 + 54d93c0
commit b8c241b
Show file tree

Hide file tree

Showing 18 changed files with 30,076 additions and 29 deletions.
diff --git a/cp2kdata/block_parser/cells.py b/cp2kdata/block_parser/cells.py
@@ -115,6 +115,7 @@ def parse_all_md_cells(output_file: List[str],
             # convert bohr to angstrom
             cell[:3] = cell[:3] * au2A
             # make sure cell length are in angstrom and cell angles are in degree before sent to cellpar_to_cell
+            #TODO: replace this cellpar_to_cell with more accurate functions in the future
             cell = cellpar_to_cell(cell)
             all_md_cells.append(cell)
     else:
@@ -130,6 +131,7 @@ def parse_all_md_cells(output_file: List[str],
             # convert bohr to angstrom
             cell[:3] = cell[:3] * au2A
             # make sure cell length are in angstrom and cell angles are in degree before sent to cellpar_to_cell
+            #TODO: replace this cellpar_to_cell with more accurate functions in the future
             cell = cellpar_to_cell(cell)
             all_md_cells.append(cell)
 

diff --git a/cp2kdata/dpdata_plugin.py b/cp2kdata/dpdata_plugin.py
@@ -72,7 +72,7 @@ def from_labeled_system(self, file_name, **kwargs):
 @Format.register("cp2k/aimd_output")
 @Format.register("cp2kdata/md")
 class CP2KMDFormat(Format):
-    def from_labeled_system(self, file_name, **kwargs):
+    def from_labeled_system(self, file_name, restart: bool=None, **kwargs):
 
         # -- Set Basic Parameters --
         path_prefix = file_name  # in cp2k md, file_name is directory name.
@@ -84,7 +84,9 @@ def from_labeled_system(self, file_name, **kwargs):
         print(WRAPPER)
 
         cp2kmd = Cp2kOutput(output_file=cp2k_output_name,
-                            run_type="MD", path_prefix=path_prefix)
+                            run_type="MD",
+                            path_prefix=path_prefix,
+                            restart=restart)
 
         num_frames = cp2kmd.get_num_frames()
 

diff --git a/cp2kdata/output.py b/cp2kdata/output.py
@@ -440,21 +440,21 @@ def parse_md(self):
                 "------------------\n"
             )
 
-        WARNING_MSG = "cp2kdata obtains more than one initial cell from the output file, \
-                    please check if your output file has duplicated header information."
 
         cell_file_list = glob.glob(os.path.join(self.path_prefix, "*.cell"))
         if (self.md_info.ensemble_type == "NVT") or \
             (self.md_info.ensemble_type == "NVE") or \
-                (self.md_info.ensemble_type == "REFTRAJ"):
+                (self.md_info.ensemble_type == "REFTRAJ"): # not ture REFTRAJ also contrains different cell?
             if cell_file_list:
                 self.all_cells = parse_md_cell(cell_file_list[0])
             elif self.filename:
                 format_logger(info="Cells", filename=self.filename)
                 print(WARNING_MSG_PARSE_CELL_FROM_OUTPUT)
+
+                #self.organize_md_cell()
                 # parse the first cell
                 first_cell = parse_all_cells(self.output_file)
-                assert first_cell.shape == (1, 3, 3), WARNING_MSG
+                assert first_cell.shape == (1, 3, 3)
                 self.all_cells = first_cell
                 self.all_cells = np.repeat(
                     self.all_cells, repeats=self.num_frames, axis=0)
@@ -466,37 +466,40 @@ def parse_md(self):
             elif self.filename:
                 format_logger(info="Cells", filename=self.filename)
                 print(WARNING_MSG_PARSE_CELL_FROM_OUTPUT)
-                # only parse the first cell
-                first_cell = parse_all_cells(self.output_file)
-                assert first_cell.shape == (1, 3, 3), WARNING_MSG
-                # parse the rest of the cells
-                self.all_cells = parse_all_md_cells(self.output_file,
-                                                    cp2k_info=self.cp2k_info)
-                # prepend the first cell
-                self.all_cells = np.insert(
-                    self.all_cells, 0, first_cell[0], axis=0)
+
+                self.organize_md_cell()
 
         elif (self.md_info.ensemble_type == "NPT_I"):
             if cell_file_list:
                 self.all_cells = parse_md_cell(cell_file_list[0])
             elif self.filename:
                 format_logger(info="Cells", filename=self.filename)
                 print(WARNING_MSG_PARSE_CELL_FROM_OUTPUT)
-                # only parse the first cell
-                first_cell = parse_all_cells(self.output_file)
-                assert first_cell.shape == (1, 3, 3), WARNING_MSG
-                # parse the rest of the cells
-                self.all_cells = parse_all_md_cells(self.output_file,
-                                                    cp2k_info=self.cp2k_info,
-                                                    init_cell_info=first_cell[0])
-                # prepend the first cell
-                self.all_cells = np.insert(
-                    self.all_cells, 0, first_cell[0], axis=0)
+
+                self.organize_md_cell()
 
         self.init_atomic_coordinates, self.atom_kind_list, self.chemical_symbols = parse_init_atomic_coordinates(
             self.output_file)
         self.atomic_kind = parse_atomic_kinds(self.output_file)
 
+    def organize_md_cell(self):
+        # whether reserve the first cell is determined by the restart
+
+        WARNING_MSG = "cp2kdata obtains more than one initial cell from the output file, \
+                    please check if your output file has duplicated header information."
+
+        # only parse the first cell
+        first_cell = parse_all_cells(self.output_file)
+        assert first_cell.shape == (1, 3, 3), WARNING_MSG
+        # parse the rest of the cells
+        self.all_cells = parse_all_md_cells(self.output_file,
+                                            cp2k_info=self.cp2k_info,
+                                            init_cell_info=first_cell[0])
+        # prepend the first cell
+        if self.cp2k_info.restart is not True:
+            self.all_cells = np.insert(
+                self.all_cells, 0, first_cell[0], axis=0)
+
     @staticmethod
     def get_global_info(run_type=None, filename=None):
         if filename:

diff --git a/docs/dpdata_plugin.md b/docs/dpdata_plugin.md
@@ -4,6 +4,8 @@
 
 For instructions on how to use `dpdata`, please refer to the official repository: https://github.com/deepmodeling/dpdata.
 
+In the following, we provide two exmples that demonstrate how to use `Cp2kData` with `dpdata` to parse data from CP2K simulations in specified formats.
+
 Currently, `CP2KData` supports two formats for use with `dpdata`:
 
 1. `cp2kdata/e_f` format for parsing `ENERGY_FORCE` outputs.
@@ -93,4 +95,45 @@ Currently, `CP2KData` supports two formats for use with `dpdata`:
    &END MOTION
    ```
 
-These examples demonstrate how to use `Cp2kData` with `dpdata` to parse and work with data from CP2K simulations in the specified formats.
+
+   In some cases, cp2k md simulations are restarted from `-1.restart` file in which the initial structure will not be evaluated again.
+   Therefore, the initial cell information should not be parsed again. Otherwise, the number of frames for cells is inconsistent with those for `poses`, `forces`, and `energies`.
+   Cp2kdata can automatically check whether the simulations are restarted or not through the header information of output:
+   ```
+    *******************************************************************************
+    *                            RESTART INFORMATION                              *
+    *******************************************************************************
+    *                                                                             *
+    *    RESTART FILE NAME: bivo4-water-1.restart                                 *
+    *                                                                             *
+    * RESTARTED QUANTITIES:                                                       *
+    *                       CELL                                                  *
+    *                       COORDINATES                                           *
+    *                       RANDOM NUMBER GENERATOR                               *
+    *                       VELOCITIES                                            *
+    *                       MD COUNTERS                                           *
+    *                       MD AVERAGES                                           *
+    *                       PARTICLE THERMOSTAT                                   *
+    *                       REAL TIME PROPAGATION                                 *
+    *                       PINT BEAD POSITIONS                                   *
+    *                       PINT BEAD VELOCITIES                                  *
+    *                       PINT NOSE THERMOSTAT                                  *
+    *                       PINT GLE THERMOSTAT                                   *
+    *                       HELIUM BEAD POSITIONS                                 *
+    *                       HELIUM PERMUTATION STATE                              *
+    *                       HELIUM FORCES ON SOLUTE                               *
+    *                       HELIUM RNG STATE                                      *
+    *******************************************************************************
+   ```
+   if the simulations are restarted using:
+   ```cp2k
+   &EXT_RESTART
+      RESTART_FILE_NAME Li-LiFSI-DME-1-2-1.restart
+   &END EXT_RESTART
+   ```
+   In case your restarted output doesn't have the above header, you can explicitly tell the cp2kdata/dpdata by setting `restart=True`,
+   ```python
+   # restart = True in case the output doesn't contains header
+   dp = dpdata.LabeledSystem(cp2kmd_dir, cp2k_output_name=cp2kmd_output_name, fmt="cp2kdata/md", restart=True)
+   ```
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "Cp2kData"
-version = "0.6.6"
+version = "0.6.7"
 description = "A Small Package to Postprocess Cp2k Output"
 authors = [
     {name = "Yongbin Zhuang", email = "[email protected]"}

diff --git a/tests/test_dpdata/test_labeledsys.py b/tests/test_dpdata/test_labeledsys.py
@@ -20,7 +20,8 @@
     "tests/test_dpdata/v2022.1/aimd",
     "tests/test_dpdata/v2022.2/aimd_npt_i",
     "tests/test_dpdata/v2023.1/aimd_nvt",
-    "tests/test_dpdata/v2023.1/aimd_npt_f"
+    "tests/test_dpdata/v2023.1/aimd_npt_f",
+    "tests/test_dpdata/v2024.1/aimd_npt_i_restart"
 ]
 
 e_f_dpdata_list = [
@@ -97,7 +98,7 @@ def test_cell(self, cp2k_and_ref):
         if not cp2k_and_ref[0].nopbc and not cp2k_and_ref[1].nopbc:
             np.testing.assert_almost_equal(cp2k_and_ref[0].data['cells'],
                                         cp2k_and_ref[1].data['cells'],
-                                        decimal = 6,
+                                        decimal = 4,
                                         err_msg = 'cell failed')
 
     def test_coord(self, cp2k_and_ref):

diff --git a/tests/test_dpdata/v2023.1/aimd_npt_f/deepmd/set.000/box.npy b/tests/test_dpdata/v2023.1/aimd_npt_f/deepmd/set.000/box.npy
diff --git a/tests/test_dpdata/v2024.1/aimd_npt_i_restart/Li-LiFSI-DME-1-1-1.ener b/tests/test_dpdata/v2024.1/aimd_npt_i_restart/Li-LiFSI-DME-1-1-1.ener
@@ -0,0 +1,6 @@
+#     Step Nr.          Time[fs]        Kin.[a.u.]          Temp[K]            Pot.[a.u.]        Cons Qty[a.u.]        UsedTime[s]
+         6            6.000000         3.618561712       709.941006762     -8580.312466062     -8576.705940797      1856.756846905
+         7            7.000000         3.774142332       740.465030228     -8580.480831882     -8576.708048669       160.384975910
+         8            8.000000         3.861779396       757.658918417     -8580.592783729     -8576.709931201       119.853107214
+         9            9.000000         3.897660921       764.698667254     -8580.627119159     -8576.708212095       120.193082809
+        10           10.000000         3.863355283       757.968098316     -8580.607578460     -8576.707023605       120.599007130