Skip to content

Commit

Permalink
Vector PRF (#25)
Browse files Browse the repository at this point in the history
* match torchscorer tqdm

* initial commit of the vector based prf

* cleanup

* cleanup

* reworked prf as transformers, added a simple test

* using ``pta.transform.by_query``

* pull forward all query columns

* bibtex block in documentation

* documentation

* whoops

* more tests

---------

Co-authored-by: Sean MacAvaney <sean.macavaney@gmail.com>
  • Loading branch information
cmacdonald and seanmacavaney authored Nov 26, 2024
1 parent 6909587 commit 6fe2564
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 3 deletions.
3 changes: 2 additions & 1 deletion pyterrier_dr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from pyterrier_dr.electra import ElectraScorer
from pyterrier_dr.bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder
from pyterrier_dr.cde import CDE, CDECache
from pyterrier_dr.prf import AveragePrf, VectorPrf

__all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex",
"BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance",
"Query2Query", "GTR", "TctColBert", "ElectraScorer", "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache",
"SimFn", "infer_device"]
"SimFn", "infer_device", "AveragePrf", "VectorPrf"]
2 changes: 1 addition & 1 deletion pyterrier_dr/flex/np_retr.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
ranked_lists = RankedLists(self.num_results, num_q)
batch_it = range(0, dvecs.shape[0], self.batch_size)
if self.flex_index.verbose:
batch_it = pt.tqdm(batch_it)
batch_it = pt.tqdm(batch_it, desc='NumpyRetriever scoring', unit='docbatch')
for idx_start in batch_it:
doc_batch = dvecs[idx_start:idx_start+self.batch_size].T
if self.flex_index.sim_fn == SimFn.cos:
Expand Down
136 changes: 136 additions & 0 deletions pyterrier_dr/prf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta


class VectorPrf(pt.Transformer):
"""
Performs a Rocchio-esque PRF by linearly combining the query_vec column with
the doc_vec column of the top k documents.
Arguments:
- alpha: weight of original query_vec
- beta: weight of doc_vec
- k: number of pseudo-relevant feedback documents
Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)
Example::
prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.vector_prf() >> index
.. code-block:: bibtex
:caption: Citation
@article{DBLP:journals/tois/0009MZKZ23,
author = {Hang Li and
Ahmed Mourad and
Shengyao Zhuang and
Bevan Koopman and
Guido Zuccon},
title = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
Successes and Pitfalls},
journal = {{ACM} Trans. Inf. Syst.},
volume = {41},
number = {3},
pages = {62:1--62:40},
year = {2023},
url = {https://doi.org/10.1145/3570724},
doi = {10.1145/3570724},
timestamp = {Fri, 21 Jul 2023 22:26:51 +0200},
biburl = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
def __init__(self,
*,
alpha: float = 1,
beta: float = 0.2,
k: int = 3
):
self.alpha = alpha
self.beta = beta
self.k = k

@pta.transform.by_query(add_ranks=False)
def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
"""Performs Vector PRF on the input dataframe."""
pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']

# get the docvectors for the top k docs
doc_vecs = np.stack([ row.doc_vec for row in inp.head(self.k).itertuples() ])
# combine their average and add to the query
query_vec = self.alpha * inp['query_vec'].iloc[0] + self.beta * np.mean(doc_vecs, axis=0)
# generate new query dataframe with the existing query columns and the new query_vec
return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])

def __repr__(self):
return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"


class AveragePrf(pt.Transformer):
"""
Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
the doc_vec column of the top k documents.
Arguments:
- k: number of pseudo-relevant feedback documents
Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)
Example::
prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.average_prf() >> index
.. code-block:: bibtex
:caption: Citation
@article{DBLP:journals/tois/0009MZKZ23,
author = {Hang Li and
Ahmed Mourad and
Shengyao Zhuang and
Bevan Koopman and
Guido Zuccon},
title = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
Successes and Pitfalls},
journal = {{ACM} Trans. Inf. Syst.},
volume = {41},
number = {3},
pages = {62:1--62:40},
year = {2023},
url = {https://doi.org/10.1145/3570724},
doi = {10.1145/3570724},
timestamp = {Fri, 21 Jul 2023 22:26:51 +0200},
biburl = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
def __init__(self,
*,
k: int = 3
):
self.k = k

@pta.transform.by_query(add_ranks=False)
def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
"""Performs Average PRF on the input dataframe."""
pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']

# get the docvectors for the top k docs and the query_vec
all_vecs = np.stack([inp['query_vec'].iloc[0]] + [row.doc_vec for row in inp.head(self.k).itertuples()])
# combine their average and add to the query
query_vec = np.mean(all_vecs, axis=0)
# generate new query dataframe with the existing query columns and the new query_vec
return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])

def __repr__(self):
return f"AveragePrf(k={self.k})"
8 changes: 8 additions & 0 deletions pyterrier_dr/pt_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,11 @@ Import ``pyterrier_dr``, load a pre-built index and model, and retrieve:
997 82.551933 10064 10063 997 1 chemical reactions
998 82.546890 4417 4416 998 1 chemical reactions
999 82.545776 7120 7119 999 1 chemical reactions
.. rubric:: Table of Contents

.. toctree::
:maxdepth: 1

prf
16 changes: 16 additions & 0 deletions pyterrier_dr/pt_docs/prf.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Pseudo Relevance Feedback (PRF)
===============================

Dense Pseudo Relevance Feedback (PRF) is a technique to improve the performance of a retrieval system by expanding the
original query vector with the vectors from the top-ranked documents. The idea is that the top-ranked documents.

PyTerrier-DR provides two dense PRF implementations: :class:`pyterrier_dr.AveragePrf` and :class:`pyterrier_dr.VectorPrf`.

API Documentation
-----------------

.. autoclass:: pyterrier_dr.AveragePrf
:members:

.. autoclass:: pyterrier_dr.VectorPrf
:members:
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
transformers
python-terrier>=0.11.0
pyterrier-alpha>=0.9.3
pyterrier-alpha>=0.10.0
torch
numpy>=1.21.0, <2.0.0
npids
Expand Down
138 changes: 138 additions & 0 deletions tests/test_prf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import unittest
import numpy as np
import pandas as pd
from pyterrier_dr import AveragePrf, VectorPrf


class TestModels(unittest.TestCase):

def test_average_prf(self):
prf = AveragePrf()
with self.subTest('single row'):
inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
self.assertEqual(out['qid'].iloc[0], 'q1')
self.assertEqual(out['query'].iloc[0], 'query')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple rows'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))

with self.subTest('multiple rows -- k=3'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))

with self.subTest('multiple rows -- k=1'):
prf.k = 1
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple queries'):
prf.k = 3
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 2)
self.assertEqual(out['qid'].iloc[0], 'q1')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
self.assertEqual(out['qid'].iloc[1], 'q2')
np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))

def test_vector_prf(self):
prf = VectorPrf(alpha=0.5, beta=0.5)
with self.subTest('single row'):
inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
self.assertEqual(out['qid'].iloc[0], 'q1')
self.assertEqual(out['query'].iloc[0], 'query')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple rows'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)

with self.subTest('multiple rows -- k=3'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)

with self.subTest('multiple rows -- k=1'):
prf.k = 1
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple queries'):
prf.k = 3
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 2)
self.assertEqual(out['qid'].iloc[0], 'q1')
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
self.assertEqual(out['qid'].iloc[1], 'q2')
np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))



if __name__ == '__main__':
unittest.main()

0 comments on commit 6fe2564

Please sign in to comment.