Re: NDCG
- Subject: Re: NDCG
- From: Ian Soboroff <ian.soboroff@nist.gov>
- Date: Thu, 29 Nov 2007 15:34:00 -0500
- Content-Type: multipart/mixed; boundary="=-=-="
- In-Reply-To: <9cfsl2shp7c.fsf@rogue.ncsl.nist.gov> (Ian Soboroff's message of "Mon, 26 Nov 2007 12:49:27 -0500")
- User-Agent: Gnus/5.110007 (No Gnus v0.7) Emacs/22.1.50 (darwin)
Ian Soboroff <ian.soboroff@nist.gov> writes:
> Is there a strong desire out there for a standardized NDCG
> implementation? How should it work?
Well, in the absence of further comment, I offer here a patch to
trec_eval to compute NDCG. It's called "ndcg_trec" in the output, thus
distinguishing it from whatever version of NDCG you happen to like
better. This patch applies to trec_eval version 8.1, you can find that
on the TREC website.
It discounts using log2(rank + 1) for all ranks. Gains are settable on
the command line, and default to the value of the relevance judgment.
(That means that you could use the rel field in the qrels file to
document "official gain values".)
I think that this will wind up in a future version of trec_eval, so this
is a good opportunity to argue about the implementation if you want to.
Ian
diff -uN trec_eval.8.1/README trec_eval.8.1.ndcg/README
--- trec_eval.8.1/README 2006-07-24 09:03:27.000000000 -0400
+++ trec_eval.8.1.ndcg/README 2007-11-29 09:45:30.000000000 -0500
@@ -244,6 +244,7 @@
P200 Precision after 200 docs retrieved
P500 Precision after 500 docs retrieved
P1000 Precision after 1000 docs retrieved
+ndcg_trec Normalized discounted cumulative gain after 1000 docs retrieved
Minor measures with their relational names:
diff -uN trec_eval.8.1/README.NDCG trec_eval.8.1.ndcg/README.NDCG
--- trec_eval.8.1/README.NDCG 1969-12-31 19:00:00.000000000 -0500
+++ trec_eval.8.1.ndcg/README.NDCG 2007-11-29 10:36:03.000000000 -0500
@@ -0,0 +1,21 @@
+
+Adding NDCG to trec_eval
+
+Current state:
+
+- The NDCG here is according to Jarvelin and Kekalainen (ACM ToIS
+ v. 20, pp. 422-446, 2002), with the modification that the discount
+ is always log2(rank + 1). That is, the discount function is always
+ a base-2 log, and the amount is one more than the rank (so that rank
+ 1 is not a special case).
+
+- Gain values default to the relevance values in the qrels file. If
+ you want different gains, use the -G command line option to specify
+ them.
+
+- The measure is reported as a short-output measure, to aid debugging.
+ The name is "ndcg_trec", to distinguish it from other applications
+ of NDCG in the literature.
+
+
+
diff -uN trec_eval.8.1/get_qrels.c trec_eval.8.1.ndcg/get_qrels.c
--- trec_eval.8.1/get_qrels.c 2004-10-09 15:00:13.000000000 -0400
+++ trec_eval.8.1.ndcg/get_qrels.c 2007-11-29 14:29:18.000000000 -0500
@@ -137,6 +137,12 @@
if (NULL == (current_qrels->text_qrels =
Malloc (INIT_NUM_RELS, TEXT_QRELS)))
return (UNDEF);
+ current_qrels->max_num_rel_levels = INIT_NUM_REL_LEVELS;
+ if (NULL == (current_qrels->rel_count =
+ Malloc (INIT_NUM_REL_LEVELS, long)))
+ return (UNDEF);
+ bzero(current_qrels->rel_count,
+ current_qrels->max_num_rel_levels * sizeof(long));
all_trec_qrels->num_q_qrels++;
}
else {
@@ -160,8 +166,21 @@
current_qrels->text_qrels[current_qrels->num_text_qrels].docno =
docno_ptr;
rel = atol (rel_ptr);
+ if (rel >= current_qrels->max_num_rel_levels) {
+ i = rel + 1;
+ if (NULL == (current_qrels->rel_count =
+ Realloc (current_qrels->rel_count, i, long)))
+ return (UNDEF);
+ while (current_qrels->max_num_rel_levels < i) {
+ current_qrels->rel_count[current_qrels->max_num_rel_levels] = 0;
+ current_qrels->max_num_rel_levels++;
+ }
+ current_qrels->max_num_rel_levels = i;
+ }
current_qrels->text_qrels[current_qrels->num_text_qrels++].rel =
rel;
+ if (rel >= 0)
+ current_qrels->rel_count[rel]++;
}
return (1);
diff -uN trec_eval.8.1/measures.c trec_eval.8.1.ndcg/measures.c
--- trec_eval.8.1/measures.c 2006-07-24 08:15:00.000000000 -0400
+++ trec_eval.8.1.ndcg/measures.c 2007-11-29 09:46:55.000000000 -0500
@@ -178,6 +178,8 @@
0, 0, 0, 0, 1, 0, 0, 1, offsetof(TREC_EVAL, gm_bpref)},
{"rank_first_rel", "Rank of top relevant document (0 if none)",
1, 0, 0, 1, 0, 0, 0, 0, offsetof(TREC_EVAL, rank_first_rel)},
+ {"ndcg_trec", "Normalized discounted cumulative gain after 1000 docs retrieved",
+ 0, 1, 0, 0, 0, 1, 0, 0, offsetof(TREC_EVAL, norm_disc_cum_gain)},
};
int num_sing_meas = sizeof (sing_meas) / sizeof (sing_meas[0]);
Common subdirectories: trec_eval.8.1/test and trec_eval.8.1.ndcg/test
diff -uN trec_eval.8.1/trec_eval.c trec_eval.8.1.ndcg/trec_eval.c
--- trec_eval.8.1/trec_eval.c 2006-06-22 11:16:43.000000000 -0400
+++ trec_eval.8.1.ndcg/trec_eval.c 2007-11-29 15:03:56.000000000 -0500
@@ -43,6 +43,9 @@
*7 -ud<num>: Value to use for 'd' coefficient of utility computation.
*7 -N<num>: Number of docs in collection
*7 -M<num>:Max number of results to evaluate per topic
+ *7 -G<lvl>=<num>,<lvl>=<num>...: Gain values for NDCG computation. lvl
+ *7 is a relevance level, num is a floating-point gain value. Default
+ *7 gain for any level is the level value itself.
*8 Procedure is to read all the docs retrieved for a query, and all the
*8 relevant docs for that query,
@@ -64,6 +67,7 @@
void print_error();
void old_print_trec_eval_list();
void print_rel_trec_eval_list();
+static int gain_map();
int trec_eval_help(EVAL_PARAM_INFO *epi);
int accumulate_results (TREC_EVAL *query_eval, TREC_EVAL *accum_eval);
@@ -72,7 +76,7 @@
int form_trvec (EVAL_PARAM_INFO *ep, TREC_TOP *trec_top,
TREC_QRELS *trec_qrels, TR_VEC *tr_vec, long *num_rel);
int trvec_trec_eval (EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel, long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *trec_qrels);
static char *usage = "Usage: trec_eval [-h] [-q] [-a] [-o] [-v] trec_rel_file trec_top_file\n\
-h: Give full help information, including other options\n\
@@ -91,9 +95,11 @@
ALL_TREC_QRELS all_trec_qrels;
TREC_EVAL accum_eval, query_eval;
TR_VEC tr_vec;
- long num_rel;
long num_eval_q;
long i,j;
+ char *c;
+ long lvl;
+ double gain;
EVAL_PARAM_INFO epi;
/* Initialize static info before getting program optional args */
@@ -105,6 +111,12 @@
epi.num_docs_in_coll = 0;
epi.relevance_level = 1;
epi.max_num_docs_per_topic = MAXLONG;
+ epi.max_gains = INIT_NUM_REL_LEVELS;
+ if (NULL == (epi.gain = Malloc(INIT_NUM_REL_LEVELS, double)))
+ exit(0);
+
+ for (i = 0; i < INIT_NUM_REL_LEVELS; i++)
+ epi.gain[i] = i;
/* Should use getopts, but some people may not have it. */
/* This keeps growing over the years. Should redo */
@@ -152,6 +164,37 @@
}
else if (argv[1][1] == 'T')
epi.time_flag++;
+ else if (argv[1][1] == 'G') {
+ c = &argv[1][2];
+ while (*c != '\0') {
+ lvl = atoi(c);
+ if (lvl < 0) {
+ (void) fputs (usage,stderr);
+ exit (1);
+ }
+ while (*c != '=' && *c != '\0')
+ c++;
+ if (*c != '=') {
+ (void) fputs (usage,stderr);
+ exit (1);
+ }
+ c++;
+ gain = atof(c);
+ if (lvl >= epi.max_gains) {
+ j = lvl + 1;
+ if (NULL == (epi.gain = Realloc(epi.gain, j, double)))
+ exit(0);
+ for (i = epi.max_gains; i < j; i++)
+ epi.gain[i] = i;
+ epi.max_gains = j;
+ }
+ epi.gain[lvl] = gain;
+ while (*c != ',' && *c != '\0')
+ c++;
+ if (*c == ',')
+ c++;
+ }
+ }
else {
(void) fputs (usage,stderr);
exit (1);
@@ -163,7 +206,7 @@
(void) fputs (usage,stderr);
exit (1);
}
-
+
trec_rel_file = argv[1];
trec_top_file = argv[2];
@@ -175,6 +218,30 @@
exit (2);
}
+ /* Check the qrels to see if there are relevance values outside
+ the range of known relevance values in the gain mapping. Assign
+ defaults. */
+ j = epi.max_gains;
+ for (i = 0; i < all_trec_qrels.num_q_qrels; i++)
+ if (all_trec_qrels.trec_qrels[i].max_num_rel_levels > j)
+ j = all_trec_qrels.trec_qrels[i].max_num_rel_levels;
+ if (j > epi.max_gains) {
+ if (NULL == (epi.gain = Realloc(epi.gain, j, double)))
+ exit(2);
+ for (i = epi.max_gains; i < j; i++) {
+ epi.gain[i] = i;
+ }
+ epi.max_gains = j;
+ }
+
+ /* Make a mapping of the NDCG gain values in increasing order.
+ This is used in computing the ideal gain vector. */
+ if (NULL == (epi.gain_order = Malloc(epi.max_gains, long)))
+ exit(2);
+ for (i = 0; i < epi.max_gains; i++)
+ epi.gain_order[i] = i;
+ qsort_r(epi.gain_order, epi.max_gains, sizeof(long), &epi, gain_map);
+
/* For each topic which has both qrels and top results information,
calculate, possibly print (if query_flag), and accumulate
evaluation measures. */
@@ -192,12 +259,15 @@
if (j >= all_trec_qrels.num_q_qrels)
continue;
+ /* Initialize eval struct to 0 */
+ bzero ((char *) &query_eval, sizeof (TREC_EVAL));
+
/* Form results/rel into SMART TR_VEC form */
if (UNDEF == form_trvec (&epi,
&all_trec_top.trec_top[i],
&all_trec_qrels.trec_qrels[j],
&tr_vec,
- &num_rel)) {
+ &query_eval.num_rel)) {
print_error ("trec_eval: form_tr_vec error", "Quit");
exit (3);
}
@@ -206,8 +276,7 @@
if (UNDEF == trvec_trec_eval (&epi,
&tr_vec,
&query_eval,
- num_rel,
- all_trec_qrels.trec_qrels[j].num_text_qrels - num_rel)) {
+ &all_trec_qrels.trec_qrels[j])) {
print_error ("trec_eval: evaluation error", "Quit");
exit (4);
}
@@ -255,3 +324,17 @@
exit (0);
}
+
+static int
+gain_map (epi, ptr1, ptr2)
+EVAL_PARAM_INFO *epi;
+const long *ptr1;
+const long *ptr2;
+{
+ if (epi->gain[*ptr1] < epi->gain[*ptr2])
+ return -1;
+ else if (epi->gain[*ptr1] > epi->gain[*ptr2])
+ return 1;
+ else
+ return 0;
+}
diff -uN trec_eval.8.1/trec_eval.h trec_eval.8.1.ndcg/trec_eval.h
--- trec_eval.8.1/trec_eval.h 2006-07-24 08:07:04.000000000 -0400
+++ trec_eval.8.1.ndcg/trec_eval.h 2007-11-29 14:54:52.000000000 -0500
@@ -28,6 +28,9 @@
which a doc is considered relevant for
this evaluation */
long max_num_docs_per_topic; /* MAXLONG. evaluate only this many docs */
+ long max_gains; /* INIT_NUM_REL_LEVELS. */
+ double *gain; /* Gain values to use for NDCG */
+ long *gain_order; /* Lookup for increasing order of gains */
} EVAL_PARAM_INFO;
/* Measure characteristics (how to print them, average them). */
@@ -119,6 +122,8 @@
long num_text_qrels; /* number of judged documents */
long max_num_text_qrels; /* Num docs space reserved for */
TEXT_QRELS *text_qrels; /* Array of judged TEXT_QRELS */
+ long max_num_rel_levels; /* Number of relevance judgment levels */
+ long *rel_count; /* Count of docs with each judgment */
} TREC_QRELS;
typedef struct { /* Overall relevance judgements */
@@ -132,6 +137,7 @@
#define INIT_NUM_QUERIES 50
#define INIT_NUM_RESULTS 1000
#define INIT_NUM_RELS 2000
+#define INIT_NUM_REL_LEVELS 3
/* Non standard values for tr_vec->rel field */
#define RELVALUE_NONPOOL -1
@@ -234,6 +240,7 @@
relevant document */
long rank_first_rel; /* Rank of top retrieved rel doc. Set to
0 if none. Unaveraged */
+ float norm_disc_cum_gain; /* Normalized discounted cumulative gain */
/* Measures after each document */
float recall_cut[NUM_CUTOFF]; /* Recall after cutoff[i] docs */
diff -uN trec_eval.8.1/trec_eval_help.c trec_eval.8.1.ndcg/trec_eval_help.c
--- trec_eval.8.1/trec_eval_help.c 2006-07-24 08:50:50.000000000 -0400
+++ trec_eval.8.1.ndcg/trec_eval_help.c 2007-11-29 10:48:58.000000000 -0500
@@ -35,6 +35,10 @@
-Ub<num>: Value to use for 'b' coefficient of utility computation. \n\
-Uc<num>: Value to use for 'c' coefficient of utility computation. \n\
-Ud<num>: Value to use for 'd' coefficient of utility computation. \n\
+ -G<rel>=<num>[,<rel>=<num>]...: Gain values for normalized discounted \n\
+ cumulative gain. <rel> is a nonnegative integer relevance level, \n\
+ <num> is the gain value which may be floating-point. \n\
+ The default gain value is the value of that relevance level. \n\
-J: Calculate all values only over the judged (either relevant or \n\
nonrelevant) documents. All unjudged documents are removed from the \n\
retrieved set before any calculations (possibly leaving an empty set). \n\
diff -uN trec_eval.8.1/trvec_teval.c trec_eval.8.1.ndcg/trvec_teval.c
--- trec_eval.8.1/trvec_teval.c 2006-06-22 13:23:59.000000000 -0400
+++ trec_eval.8.1.ndcg/trvec_teval.c 2007-11-29 15:15:29.000000000 -0500
@@ -13,28 +13,22 @@
static int compare_iter_rank();
static void calc_cutoff_measures(EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel,
- long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *qrels);
static void calc_bpref_measures(EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel,
- long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *qrels);
static void calc_average_measures(EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel,
- long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *qrels);
static void calc_exact_measures(EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel,
- long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *qrels);
static void calc_time_measures(EVAL_PARAM_INFO *epi, TR_VEC *tr_vec,
- TREC_EVAL *eval, long num_rel,
- long num_nonrel);
+ TREC_EVAL *eval, TREC_QRELS *qrels);
int
-trvec_trec_eval (epi, tr_vec, eval, num_rel, num_nonrel)
+trvec_trec_eval (epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
long j;
long max_iter;
@@ -42,9 +36,6 @@
if (tr_vec == (TR_VEC *) NULL)
return (UNDEF);
- /* Initialize everything to 0 */
- bzero ((char *) eval, sizeof (TREC_EVAL));
-
eval->qid = tr_vec->qid;
eval->num_queries = 1;
@@ -53,8 +44,6 @@
return (0);
}
- eval->num_rel = num_rel;
-
/* Evaluate only the docs on the last iteration of new_tr_vec */
/* Sort the tr tuples for this query by decreasing iter and
increasing rank */
@@ -78,20 +67,20 @@
/* Calculate cutoff measures, and those measures dependant on them */
/* Also includes recip_rank and rank_first_rel */
- calc_cutoff_measures (epi, tr_vec, eval, num_rel, num_nonrel);
+ calc_cutoff_measures (epi, tr_vec, eval, qrels);
/* Calculate bpref and related measures (judged docs only) */
- calc_bpref_measures (epi, tr_vec, eval, num_rel, num_nonrel);
+ calc_bpref_measures (epi, tr_vec, eval, qrels);
/* Calculate measures that average over ret or rel docs */
- calc_average_measures (epi, tr_vec, eval, num_rel, num_nonrel);
+ calc_average_measures (epi, tr_vec, eval, qrels);
/* Calculate exact measures over entire retrieved sets */
- calc_exact_measures (epi, tr_vec, eval, num_rel, num_nonrel);
+ calc_exact_measures (epi, tr_vec, eval, qrels);
/* Calculate time measures, if wanted */
if (epi->time_flag)
- calc_time_measures (epi, tr_vec, eval, num_rel, num_nonrel);
+ calc_time_measures (epi, tr_vec, eval, qrels);
return (1);
@@ -122,12 +111,11 @@
static void
-calc_cutoff_measures(epi, tr_vec, eval, num_rel, num_nonrel)
+calc_cutoff_measures(epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
double recall, precis; /* current recall, precision values */
double rel_precis, rel_uap;/* relative precision, uap values */
@@ -292,14 +280,14 @@
}
static void
-calc_bpref_measures (epi, tr_vec, eval, num_rel, num_nonrel)
+calc_bpref_measures (epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
long j;
+ long num_nonrel;
long nonrel_ret;
long nonrel_so_far, rel_so_far, pool_unjudged_so_far;
long bounded_5R_nonrel_so_far, bounded_10R_nonrel_so_far;
@@ -310,6 +298,8 @@
long pref_top_10pRnonrel_num;
long pref_top_Rnonrel_num;
+ num_nonrel = qrels->num_text_qrels - eval->num_rel;
+
/* Calculate judgement based measures (dependent on only
judged docs; no assumption of non-relevance if not judged) */
/* Binary Preference measures; here expressed as all docs with a higher
@@ -469,19 +459,22 @@
}
static void
-calc_average_measures (epi, tr_vec, eval, num_rel, num_nonrel)
+calc_average_measures (epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
double recall, precis; /* current recall, precision values */
double rel_precis, rel_uap;/* relative precision, uap values */
double int_precis; /* current interpolated precision values */
-
+ double ideal_dcg; /* ideal discounted cumulative gain */
+ double gain;
+ long rel;
+
long i,j;
long rel_so_far;
+ long cur_lvl, lvl_count;
/* Note for interpolated precision values (Prec(X) = MAX (PREC(Y)) for all
Y >= X) */
@@ -516,10 +509,19 @@
eval->int_av_R_precis += int_precis;
}
- if (tr_vec->tr[j-1].rel >= epi->relevance_level) {
+ rel = tr_vec->tr[j-1].rel;
+ if (rel >= epi->relevance_level) {
eval->int_av_recall_precis += int_precis;
eval->av_recall_precis += precis;
eval->avg_doc_prec += precis;
+
+ if (rel >= epi->max_gains || rel < 0)
+ gain = 0;
+ else
+ gain = epi->gain[rel];
+ if (gain > 0) {
+ eval->norm_disc_cum_gain += gain / log2(j + 1);
+ }
rel_so_far--;
}
else {
@@ -549,10 +551,34 @@
}
}
+ /* Calculate ideal discounted cumulative gain for this topic */
+ /* The algorithm is to consider the ideal ordering of gains for
+ all documents, and to walk through that ordering. */
+ cur_lvl = epi->max_gains;
+ lvl_count = 0;
+ for (j = 1; j <= epi->max_num_docs_per_topic; j++) {
+ while (lvl_count == 0 && cur_lvl > 0) {
+ cur_lvl--;
+ if (epi->gain_order[cur_lvl] < qrels->max_num_rel_levels)
+ lvl_count = qrels->rel_count[epi->gain_order[cur_lvl]];
+ }
+ if (cur_lvl == 0)
+ break;
+ lvl_count--;
+
+ if (epi->gain_order[cur_lvl] < epi->relevance_level)
+ gain = 0;
+ else
+ gain = epi->gain[epi->gain_order[cur_lvl]];
+
+ ideal_dcg += gain / log2(j + 1);
+ }
+
/* Calculate all the other averages */
if (eval->num_rel_ret > 0) {
eval->av_recall_precis /= eval->num_rel;
eval->int_av_recall_precis /= eval->num_rel;
+ eval->norm_disc_cum_gain /= ideal_dcg;
}
eval->av_fall_recall /= MAX_FALL_RET;
@@ -574,12 +600,11 @@
}
static void
-calc_exact_measures (epi, tr_vec, eval, num_rel, num_nonrel)
+calc_exact_measures (epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
if (eval->num_rel) {
@@ -603,12 +628,11 @@
}
static void
-calc_time_measures (epi, tr_vec, eval, num_rel, num_nonrel)
+calc_time_measures (epi, tr_vec, eval, qrels)
EVAL_PARAM_INFO *epi;
TR_VEC *tr_vec;
TREC_EVAL *eval;
-long num_rel; /* Number relevant judged */
-long num_nonrel; /* Number nonrelevant judged */
+TREC_QRELS *qrels;
{
double recall, precis; /* current recall, precision values */
double rel_precis, rel_uap;/* relative precision, uap values */
- References:
- Re: NDCG
- From: Ian Soboroff <ian.soboroff@nist.gov>
Date Index |
Thread Index |
Problems or questions? Contact list-master@nist.gov