From 6cacc3d68d5027179c46bf9b71a32497b5880e17 Mon Sep 17 00:00:00 2001 From: Diomidis Spinellis Date: Sun, 26 Feb 2023 21:47:39 +0200 Subject: [PATCH] Improve CD5 index calculation - Include works lacking a reference list in the calculation - Do not calculate a CD index for such works - Do not calculate a CD index for works published after 2016, because they lack five years of citations to them. - Use graph node properties to implement the above, rather than expensive SQL post-processing. --- examples/cdindex/cdindex-db.cpp | 39 ++++++++++++++++++----------- examples/cdindex/nature-example.sql | 1 - examples/cdindex/yearly-cdindex.sql | 4 +-- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/cdindex/cdindex-db.cpp b/examples/cdindex/cdindex-db.cpp index 059ac1c2..854138d0 100644 --- a/examples/cdindex/cdindex-db.cpp +++ b/examples/cdindex/cdindex-db.cpp @@ -20,6 +20,7 @@ #include #include +#define END_YEAR 2021 #define RANGE "published_year BETWEEN 1945 and 2021" // #define RANGE "published_year BETWEEN 1945 and 1946" @@ -29,10 +30,15 @@ using namespace std; const bool use_random_values = false; const int RANDOM_POPULATION_SIZE = 10000000; +// Last timestamp for which CD index can be calculated +timestamp_t HORIZON; + + const int BATCH_SIZE = 10000; // Five years, for calculating the CD_5 index -const time_t DELTA = 5 * 365 * 24 * 60 * 60; +const int DELTA_YEAR = 5; +const time_t DELTA = DELTA_YEAR * 365 * 24 * 60 * 60; // Data associated with vertices class Vdata { @@ -50,6 +56,17 @@ typedef pair work_type; static atomic work_counter = 0; +/* + * Return true if a valid CD index can be calculated for the specified + * node, i.e. if the node has references and its publication time + * allows the establishment of an N-year focal point. 
+ */ +static bool +valid_cd_index(Vertex *v) +{ + return v->get_out_degree() > 0 && v->get_timestamp() <= HORIZON; +} + /* * Calculate CD-index along the passed begin/end range and store it in * the vertice's data @@ -58,7 +75,8 @@ static void worker(work_type &be) { for (auto v = be.first; v != be.second; v++) - s2v.at(v->first).cdindex = cdindex(v->second.vi, DELTA); + if (valid_cd_index(v->second.vi.v)) + s2v.at(v->first).cdindex = cdindex(v->second.vi, DELTA); work_counter += BATCH_SIZE; if (work_counter % 1000000 == 0) cerr << "C " << work_counter << endl; @@ -95,8 +113,7 @@ add_vertices(database &db, Graph &graph) for (auto && row : db << "SELECT doi, published_year," " Coalesce(published_month, 1)," " Coalesce(published_day, 1)" - "FROM works WHERE " RANGE " AND EXISTS" - " (SELECT 1 FROM work_references WHERE work_id == works.id)" + "FROM works WHERE " RANGE ) { string doi; int year, month, day; @@ -159,21 +176,11 @@ int main(int argc, char *argv[]) { Graph graph; + HORIZON = timestamp_from_datetime(END_YEAR - DELTA_YEAR, 12, 31); try { database cdb(argv[1]); - // Create indices - cdb << "CREATE INDEX IF NOT EXISTS works_published_year_idx ON works(published_year)"; - cerr << "Index works_published_year_idx ready" << endl; - - cdb << "CREATE INDEX IF NOT EXISTS works_id_idx ON works(id)"; - cerr << "Index works_id_idx ready" << endl; - - cdb << "CREATE INDEX IF NOT EXISTS work_references_work_id_idx" - " ON work_references(work_id)"; - cerr << "Index work_references_work_id_idx ready" << endl; - if (use_random_values) { add_random_vertices(graph); add_random_edges(graph); @@ -212,6 +219,8 @@ main(int argc, char *argv[]) auto ps = rdb << "INSERT INTO cdindex VALUES(?, ?)"; int counter = 0; for (auto v : s2v) { + if (!valid_cd_index(v.second.vi.v)) + continue; ps << v.first << v.second.cdindex; ps++; if (counter++ % 1000000 == 0) diff --git a/examples/cdindex/nature-example.sql b/examples/cdindex/nature-example.sql index 8be54a6c..995cc152 100644 --- 
a/examples/cdindex/nature-example.sql +++ b/examples/cdindex/nature-example.sql @@ -5,4 +5,3 @@ SELECT doi,cdindex FROM rolap.cdindex WHERE doi IN ('10.1103/physrev.140.a1133', '10.1038/171737a0'); - diff --git a/examples/cdindex/yearly-cdindex.sql b/examples/cdindex/yearly-cdindex.sql index 04cec2b5..c15d999b 100644 --- a/examples/cdindex/yearly-cdindex.sql +++ b/examples/cdindex/yearly-cdindex.sql @@ -7,6 +7,4 @@ SELECT published_year AS year, Avg(cdindex) AS cdindex FROM rolap.cdindex INNER JOIN works ON works.doi = cdindex.doi WHERE cdindex is not null AND published_year is not null - GROUP BY year - -- Years to 2021 are needed for calculating CD5 - HAVING year <= 2016; + GROUP BY year;