Skip to content

Commit

Permalink
Support more complex formula formats in `aiida.orm.data.cif.parse_for…
Browse files Browse the repository at this point in the history
…mula` (#3954)

The new implementation not only supports the very strict format of
the `_chemical_formula` tag of the CIF file format, but also allows
more complex versions of the Hill notation, including element groups
denoted by curly/square brackets or parentheses.

Additionally, `CifData.get_formulae` has a new optional argument called
`custom_tags` which takes a single string or a list of strings that
correspond to CIF tags other than the default `_chemical_formula_{}`.
Certain CIF file providers provide the formulae in these non-default tags.
  • Loading branch information
lorisercole authored May 7, 2020
1 parent a038876 commit 6220e85
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 40 deletions.
67 changes: 45 additions & 22 deletions aiida/orm/nodes/data/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# pylint: disable=invalid-name,too-many-locals,too-many-statements
"""Tools for handling Crystallographic Information Files (CIF)"""

import re
from aiida.common.utils import Capturing

from .singlefile import SinglefileData
Expand Down Expand Up @@ -196,27 +197,46 @@ def pycifrw_from_cif(datablocks, loops=None, names=None):

def parse_formula(formula):
    """
    Parse a chemical formula written in Hill notation and count its elements.

    Spaces between the elements are optional. Fractional counts (partial occupancies) are accepted, and so are
    chemical groups enclosed in round, square or curly brackets followed by an optional multiplier.
    NOTE(review): nested bracket groups do not appear to be supported by the patterns used here.

    e.g. 'C[NH2]3NO3'  -->  {'C': 1, 'N': 4, 'H': 6, 'O': 3}

    :param formula: the formula string, e.g. 'C5 H1', 'CaHClO' or 'H80 C104{C0.50 X0.50}8'
    :return: dictionary mapping each element symbol to its accumulated count (`int` or `float`)
    :raises ValueError: if a count cannot be converted to a number, e.g. the '0.5.2' in 'H0.5.2 O'
    """

    def chemcount_str_to_number(string):
        """Convert a count string to a number: a missing count means 1 and whole numbers are returned as `int`."""
        if not string:
            return 1
        quantity = float(string)  # raises ValueError for malformed counts such as '0.5.2'
        if quantity.is_integer():
            quantity = int(quantity)
        return quantity

    contents = {}

    # Split the formula into blocks. The capturing group keeps each bracketed group, together with its trailing
    # multiplier (everything up to the next uppercase letter or opening bracket), as a block of its own.
    for block in re.split(r'(\([^\)]*\)[^A-Z\(\[\{]*|\[[^\]]*\][^A-Z\(\[\{]*|\{[^\}]*\}[^A-Z\(\[\{]*)', formula):
        if not block:  # `re.split` yields empty strings between adjacent groups and at the edges
            continue

        # Extract the formula inside the brackets and the multiplier that follows them
        group = re.search(r'[\{\[\(](.+)[\}\]\)]([\.\d]*)', block)
        if group is None:  # plain block without brackets
            molformula = block
            molcount = 1
        else:
            molformula = group.group(1)
            molcount = chemcount_str_to_number(group.group(2))

        # Each element symbol starts with an uppercase letter, so split there once the spaces are removed
        for part in re.findall(r'[A-Z][^A-Z\s]*', molformula.replace(' ', '')):
            match = re.match(r'(\D+)([\.\d]+)?', part)  # separates the element symbol from its count

            if match is None:
                continue

            species = match.group(1)
            quantity = chemcount_str_to_number(match.group(2)) * molcount
            # The same element may occur in several blocks: accumulate instead of overwriting
            contents[species] = contents.get(species, 0) + quantity
    return contents


Expand Down Expand Up @@ -527,7 +547,7 @@ def set_parse_policy(self, parse_policy):
else:
raise ValueError('Got unknown parse_policy {}'.format(parse_policy))

def get_formulae(self, mode='sum'):
def get_formulae(self, mode='sum', custom_tags=None):
"""
Return chemical formulae specified in CIF file.
Expand All @@ -536,12 +556,19 @@ def get_formulae(self, mode='sum'):
"""
# note: If formulae are not None, they could be returned
# directly (but the function is very cheap anyhow).
formula_tag = '_chemical_formula_{}'.format(mode)
formula_tags = ['_chemical_formula_{}'.format(mode)]
if custom_tags:
if not isinstance(custom_tags, (list, tuple)):
custom_tags = [custom_tags]
formula_tags.extend(custom_tags)

formulae = []
for datablock in self.values.keys():
formula = None
if formula_tag in self.values[datablock].keys():
formula = self.values[datablock][formula_tag]
for formula_tag in formula_tags:
if formula_tag in self.values[datablock].keys():
formula = self.values[datablock][formula_tag]
break
formulae.append(formula)

return formulae
Expand Down Expand Up @@ -577,8 +604,6 @@ def has_partial_occupancies(self):
:return: True if there are partial occupancies, False otherwise
"""
import re

tag = '_atom_site_occupancy'

epsilon = 1e-6
Expand Down Expand Up @@ -628,8 +653,6 @@ def has_undefined_atomic_sites(self):
:return: boolean, True if no atomic sites are defined or if any of the defined sites contain undefined positions
and False otherwise
"""
import re

tag_x = '_atom_site_fract_x'
tag_y = '_atom_site_fract_y'
tag_z = '_atom_site_fract_z'
Expand Down
54 changes: 54 additions & 0 deletions tests/orm/data/test_cif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Tests for cif related functions."""

import pytest

from aiida.orm.nodes.data.cif import parse_formula


def test_parse_formula():
    """Check `parse_formula` on a table of valid formulae and verify that malformed counts raise `ValueError`."""
    # Pairs of (formula, expected element counts)
    valid_cases = (
        ('C H', {'C': 1, 'H': 1}),
        ('C5 H1', {'C': 5, 'H': 1}),
        ('Ca5 Ho', {'Ca': 5, 'Ho': 1}),
        ('H0.5 O', {'H': 0.5, 'O': 1}),
        ('C0 O0', {'C': 0, 'O': 0}),
        ('C1 H1 ', {'C': 1, 'H': 1}),
        (' C1 H1', {'C': 1, 'H': 1}),
        ('CaHClO', {'Ca': 1, 'H': 1, 'Cl': 1, 'O': 1}),
        ('C70 H108 Al4 La4 N4 O10', {'C': 70, 'H': 108, 'Al': 4, 'La': 4, 'N': 4, 'O': 10}),
        ('C70H108Al4Li4N4O10', {'C': 70, 'H': 108, 'Al': 4, 'Li': 4, 'N': 4, 'O': 10}),
        ('C36 H59LiN2 O3 Si', {'C': 36, 'H': 59, 'Li': 1, 'N': 2, 'O': 3, 'Si': 1}),
        ('C63.5H83.5Li2N2O3.25P2', {'C': 63.5, 'H': 83.5, 'Li': 2, 'N': 2, 'O': 3.25, 'P': 2}),
        ('Fe Li0.667 O4 P1', {'Fe': 1, 'Li': 0.667, 'O': 4, 'P': 1}),
        ('Fe2.05Ni0.05O4 Zn0.9', {'Fe': 2.05, 'Ni': 0.05, 'O': 4, 'Zn': 0.9}),
        ('Li3O6(Al0.23Li0.77)2(Li0.07Te0.93)', {'Li': 4.61, 'O': 6, 'Al': 0.46, 'Te': 0.93}),
        (
            'Li2{Cr0.05Li0.95X0.00}{Cr0.24Li0.76}2{Li0.02Te0.98}O6',
            {'Li': 4.49, 'Cr': 0.53, 'X': 0, 'Te': 0.98, 'O': 6},
        ),
        ('C[NH2]3NO3', {'C': 1, 'N': 4, 'H': 6, 'O': 3}),
        (
            'H80 C104{C0.50 X0.50}8N8 Cl4(Cl0.50X0.50)8.0O8',
            {'H': 80, 'C': 108.0, 'X': 8.0, 'N': 8, 'Cl': 8.0, 'O': 8},
        ),
        ('Na1.28[NH]0.28{N H2}0.72', {'Na': 1.28, 'N': 1.0, 'H': 1.72}),
    )
    for formula, expected in valid_cases:
        assert parse_formula(formula) == expected

    # Counts containing more than one decimal point cannot be converted to a number
    malformed_formulae = ('H0.5.2 O', 'Fe2.05Ni0.05.4', 'Na1.28[NH]0.28.3{NH2}0.72')
    for test_formula in malformed_formulae:
        with pytest.raises(ValueError):
            parse_formula(test_formula)
18 changes: 0 additions & 18 deletions tests/test_dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,24 +575,6 @@ def test_refine(self):
with self.assertRaises(ValueError):
ret_dict = refine_inline(c)

@unittest.skipIf(not has_ase(), 'Unable to import ase')
@unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW')
@unittest.skipIf(not has_spglib(), 'Unable to import spglib')
def test_parse_formula(self):
from aiida.orm.nodes.data.cif import parse_formula

self.assertEqual(parse_formula('C H'), {'C': 1, 'H': 1})
self.assertEqual(parse_formula('C5 H1'), {'C': 5, 'H': 1})
self.assertEqual(parse_formula('Ca5 Ho'), {'Ca': 5, 'Ho': 1})
self.assertEqual(parse_formula('H0.5 O'), {'H': 0.5, 'O': 1})
self.assertEqual(parse_formula('C0 O0'), {'C': 0, 'O': 0})
self.assertEqual(parse_formula('C1 H1 '), {'C': 1, 'H': 1}) # Trailing spaces should be accepted
self.assertEqual(parse_formula(' C1 H1'), {'C': 1, 'H': 1}) # Leading spaces should be accepted

# Invalid literal for float()
with self.assertRaises(ValueError):
parse_formula('H0.5.2 O')

@unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW')
def test_scan_type(self):
"""Check that different scan_types of PyCifRW produce the same result."""
Expand Down

0 comments on commit 6220e85

Please sign in to comment.