Skip to content

Commit

Permalink
Improve CD5 index calculation
Browse files Browse the repository at this point in the history
- Include works lacking a reference list in the calculation
- Do not calculate a CD index for such works
- Do not calculate a CD index for works published after 2016, because
  they lack five years of citations to them.
- Use graph node properties to implement the above, rather than
  expensive SQL post-processing.
  • Loading branch information
dspinellis committed Feb 26, 2023
1 parent 79de535 commit 6cacc3d
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 19 deletions.
39 changes: 24 additions & 15 deletions examples/cdindex/cdindex-db.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <sqlite_modern_cpp.h>
#include <cdindex.h>

// Year from which main() subtracts DELTA_YEAR to obtain HORIZON
#define END_YEAR 2021
// SQL predicate on published_year, spliced into the works query.
// NOTE(review): the upper bound repeats END_YEAR as a literal; keep both in sync.
#define RANGE "published_year BETWEEN 1945 and 2021"
// Alternative two-year range, presumably for quick test runs -- TODO confirm
// #define RANGE "published_year BETWEEN 1945 and 1946"

Expand All @@ -29,10 +30,15 @@ using namespace std;
const bool use_random_values = false;
const int RANDOM_POPULATION_SIZE = 10000000;

// Last timestamp for which CD index can be calculated
timestamp_t HORIZON;


const int BATCH_SIZE = 10000;

// Five years, for calculating the CD_5 index
const time_t DELTA = 5 * 365 * 24 * 60 * 60;
const int DELTA_YEAR = 5;
const time_t DELTA = DELTA_YEAR * 365 * 24 * 60 * 60;

// Data associated with vertices
class Vdata {
Expand All @@ -50,6 +56,17 @@ typedef pair<s2v_type::iterator, s2v_type::iterator> work_type;

static atomic<unsigned long long> work_counter = 0;

/*
 * Tell whether the specified node qualifies for a CD index.
 * Two conditions must hold: the node has references (a positive
 * out-degree) and its timestamp does not exceed HORIZON, the last
 * timestamp for which the index can be calculated.
 */
static bool
valid_cd_index(Vertex *v)
{
	// Nodes without outgoing edges get no index
	if (!(v->get_out_degree() > 0))
		return false;

	// Nodes past the horizon get no index
	return v->get_timestamp() <= HORIZON;
}

/*
* Calculate CD-index along the passed begin/end range and store it in
the vertex's data
Expand All @@ -58,7 +75,8 @@ static void
worker(work_type &be)
{
for (auto v = be.first; v != be.second; v++)
s2v.at(v->first).cdindex = cdindex(v->second.vi, DELTA);
if (valid_cd_index(v->second.vi.v))
s2v.at(v->first).cdindex = cdindex(v->second.vi, DELTA);
work_counter += BATCH_SIZE;
if (work_counter % 1000000 == 0)
cerr << "C " << work_counter << endl;
Expand Down Expand Up @@ -95,8 +113,7 @@ add_vertices(database &db, Graph &graph)
for (auto && row : db << "SELECT doi, published_year,"
" Coalesce(published_month, 1),"
" Coalesce(published_day, 1)"
"FROM works WHERE " RANGE " AND EXISTS"
" (SELECT 1 FROM work_references WHERE work_id == works.id)"
"FROM works WHERE " RANGE
) {
string doi;
int year, month, day;
Expand Down Expand Up @@ -159,21 +176,11 @@ int
main(int argc, char *argv[])
{
Graph graph;
HORIZON = timestamp_from_datetime(END_YEAR - DELTA_YEAR, 12, 31);

try {
database cdb(argv[1]);

// Create indices
cdb << "CREATE INDEX IF NOT EXISTS works_published_year_idx ON works(published_year)";
cerr << "Index works_published_year_idx ready" << endl;

cdb << "CREATE INDEX IF NOT EXISTS works_id_idx ON works(id)";
cerr << "Index works_id_idx ready" << endl;

cdb << "CREATE INDEX IF NOT EXISTS work_references_work_id_idx"
" ON work_references(work_id)";
cerr << "Index work_references_work_id_idx ready" << endl;

if (use_random_values) {
add_random_vertices(graph);
add_random_edges(graph);
Expand Down Expand Up @@ -212,6 +219,8 @@ main(int argc, char *argv[])
auto ps = rdb << "INSERT INTO cdindex VALUES(?, ?)";
int counter = 0;
for (auto v : s2v) {
if (!valid_cd_index(v.second.vi.v))
continue;
ps << v.first << v.second.cdindex;
ps++;
if (counter++ % 1000000 == 0)
Expand Down
1 change: 0 additions & 1 deletion examples/cdindex/nature-example.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@
SELECT doi,cdindex
FROM rolap.cdindex
WHERE doi IN ('10.1103/physrev.140.a1133', '10.1038/171737a0');

4 changes: 1 addition & 3 deletions examples/cdindex/yearly-cdindex.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,4 @@ SELECT published_year AS year, Avg(cdindex) AS cdindex
FROM rolap.cdindex
INNER JOIN works ON works.doi = cdindex.doi
WHERE cdindex is not null AND published_year is not null
GROUP BY year
-- Years to 2021 are needed for calculating CD5
HAVING year <= 2016;
GROUP BY year;

0 comments on commit 6cacc3d

Please sign in to comment.