Merge pull request #48 from zijwang/sentence_reordering

Added sentence reordering transformation
GEM-benchmark · Jul 5, 2021 · fec76ce · fec76ce
2 parents 48135d4 + 8e641f7
commit fec76ce
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 1 deletion.
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-checklist==0.0.10
+checklist==0.0.11
 spacy==2.2.4
 
 # for back_translation

diff --git a/transformations/sentence_reordering/README.md b/transformations/sentence_reordering/README.md
@@ -0,0 +1,50 @@
+# Sentence reordering
+This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) by randomly shuffling the sentences in the input text, applying coreference resolution first to reduce ambiguity.
+
+Author name: Zijian Wang (zijwang@hotmail.com)
+
+## What type of a transformation is this?
+This transformation could shuffle sentence order in the input text, which could test model robustness. 
+
+## What tasks does it intend to benefit?
+This perturbation would benefit all tasks on text classification and generation. 
+
+Benchmark results:
+
+- Sentiment analysis: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 95.6 and the perturbed accuracy is 95.2.
+- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original BLEU is 15.99 and the perturbed BLEU is 9.75.
+
+## Related work
+
+This is very similar to the `Sentence Permutation` noising method in the BART paper. 
+
+```bibtex
+@inproceedings{lewis2020bart,
+  title={BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
+  author={Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
+  booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
+  pages={7871--7880},
+  year={2020}
+}
+```
+
+The coreference resolution model is from the following paper
+
+```bibtex
+@inproceedings{lee2018higher,
+  title={Higher-Order Coreference Resolution with Coarse-to-Fine Inference},
+  author={Lee, Kenton and He, Luheng and Zettlemoyer, Luke},
+  booktitle={Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)},
+  pages={687--692},
+  year={2018}
+}
+```
+
+We use its [AllenNLP implementation](https://demo.allennlp.org/coreference-resolution).
+
+
+## What are the limitations of this transformation?
+
+This transformation will only change the input text that has more than one sentence. 
+
+There are still cases that coreference resolution alone cannot handle. For example, there could be ellipsis problems, as demonstrated by [this paper on narrative reordering](https://arxiv.org/pdf/2104.06669v1.pdf). We leave these as future work for simplicity. 
diff --git a/transformations/sentence_reordering/__init__.py b/transformations/sentence_reordering/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/sentence_reordering/requirements.txt b/transformations/sentence_reordering/requirements.txt
@@ -0,0 +1,3 @@
+# for sentence_reordering
+allennlp==2.5.0
+allennlp-models==2.5.0
diff --git a/transformations/sentence_reordering/test.json b/transformations/sentence_reordering/test.json
@@ -0,0 +1,71 @@
+{
+  "type": "sentence_reordering",
+  "test_cases": [
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "The Novikov conjecture is one of the most important unsolved problems in topology. It is named for Sergei Novikov who originally posed the conjecture in 1965. The Novikov conjecture concerns the homotopy invariance of certain polynomials in the Pontryagin classes of a manifold, arising from the fundamental group. According to the Novikov conjecture, the higher signatures, which are certain numerical invariants of smooth manifolds, are homotopy invariants."
+      },
+      "outputs": [
+        {
+          "sentence": "The Novikov conjecture concerns the homotopy invariance of certain polynomials in the Pontryagin classes of a manifold, arising from the fundamental group. The Novikov conjecture is named for Sergei Novikov who originally posed The Novikov conjecture in 1965. According to The Novikov conjecture, the higher signatures, which are certain numerical invariants of smooth manifolds, are homotopy invariants. The Novikov conjecture is one of the most important unsolved problems in topology."
+        }
+      ]
+    },
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "Albany Theatre is a historic theater in Albany, Georgia. It was added to the National Register of Historic Places on August 21, 2006. The Albany Theatre opened on September 12, 1927. The theatre is no longer in operation. It is located at 107 North Jackson Street."
+      },
+      "outputs": [
+        {
+          "sentence": "Albany Theatre is no longer in operation. Albany Theatre was added to the National Register of Historic Places on August 21, 2006. Albany Theatre opened on September 12, 1927. Albany Theatre is located at 107 North Jackson Street. Albany Theatre is a historic theater in Albany, Georgia."
+        }
+      ]
+    },
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "Intertoys is a Dutch store-chain founded in 1976 that specialised in toys, multimedia and electronics. It is headquartered in Amsterdam."
+      },
+      "outputs": [
+        {
+          "sentence": "Intertoys is headquartered in Amsterdam. Intertoys is a Dutch store-chain founded in 1976 that specialised in toys, multimedia and electronics."
+        }
+      ]
+    },
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "QuantumScape is an American company that does research about solid state lithium metal batteries for electric cars. The company is headquartered in San Jose, California and employs around 200 people. Investors include Bill Gates and Volkswagen."
+      },
+      "outputs": [
+        {
+          "sentence": "QuantumScape is headquartered in San Jose, California and employs around 200 people. QuantumScape is an American company that does research about solid state lithium metal batteries for electric cars. Investors include Bill Gates and Volkswagen."
+        }
+      ]
+    },
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "Sousmoulins is a commune in the Charente-Maritime department in southwestern France. The Seugne forms part of the commune's northeastern border."
+      },
+      "outputs": [
+        {
+          "sentence": "The Seugne forms part of a commune in the Charente-Maritime department in southwestern France's northeastern border. Sousmoulins is a commune in the Charente-Maritime department in southwestern France."
+        }
+      ]
+    },
+    {
+      "class": "SentenceReordering",
+      "inputs": {
+        "sentence": "John is a great person. He resides in Australia. Peter is also a great person. He resides in India."
+      },
+      "outputs": [
+        {
+          "sentence": "Peter is also a great person. John resides in Australia. Peter resides in India. John is a great person."
+        }
+      ]
+    }
+  ]
+}
diff --git a/transformations/sentence_reordering/transformation.py b/transformations/sentence_reordering/transformation.py
@@ -0,0 +1,47 @@
+import random
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+# for sent tokenizer
+import spacy
+
+# coref resolution from allennlp
+# ref: https://demo.allennlp.org/coreference-resolution
+import allennlp_models.tagging
+from allennlp.predictors.predictor import Predictor
+
+
+"""
+Shuffle sentence order
+"""
+
+
+class SentenceReordering(SentenceOperation):
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+    ]
+    languages = ["en"]
+
+    def __init__(self, seed=42, max_output=1):
+        super().__init__(seed)
+        self.seed = seed
+        self.nlp = spacy.load("en_core_web_sm")
+        self.max_output = max_output
+        self.coref_model = Predictor.from_path(
+            "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"
+        )
+
+    def generate(self, sentence: str):
+        pertubed = [self.sentence_reordering(text=sentence)]
+        return pertubed
+
+    def sentence_reordering(self, text):
+        random.seed(self.seed)
+        # resolve coref
+        text = self.coref_model.coref_resolved(document=text)
+
+        # tokenize and shuffle
+        text_split = [i.text for i in self.nlp(text).sents]
+        random.shuffle(text_split)
+        return " ".join(text_split)