diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT
index 27638a925..0703edf92 100644
--- a/docs/CHANGES.TXT
+++ b/docs/CHANGES.TXT
@@ -6,6 +6,7 @@
- New: Added more fatals on no arguments errors.
- New: Updated ccextractor.cnf.sample.
- New: Add support for censoring words ("Kid Friendly") (#1139)
+- New: Extend capitalization and word-censoring support to all BITMAP and 608 subtitles (#1214)
- Fix: ccx_demuxer_mxf.c: Parse framerate from MXF captions to fix caption timings.
- Fix: hardsubx_decoder.c: Fix memory leaks using Leptonica API.
- Fix: linux/Makefile.am: added some sources to enable rpms to be created.
diff --git a/src/lib_ccx/ccx_encoders_common.c b/src/lib_ccx/ccx_encoders_common.c
index 21e136b79..f1b6cbb4a 100644
--- a/src/lib_ccx/ccx_encoders_common.c
+++ b/src/lib_ccx/ccx_encoders_common.c
@@ -615,8 +615,6 @@ void write_cc_line_as_simplexml(struct eia608_screen *data, struct encoder_ctx *
char *cap = "
";
char *cap1 = "";
- correct_spelling_and_censor_words_608(context, line_number, data);
-
length = get_str_basic(context->subline, data->characters[line_number],
context->trim_subs, CCX_ENC_ASCII, context->encoding, CCX_DECODER_608_SCREEN_WIDTH);
@@ -1155,6 +1153,9 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
data->end_time += utc_refvalue * 1000;
}
+ for (int i = 0; i < CCX_DECODER_608_SCREEN_ROWS; ++i)
+ correct_spelling_and_censor_words(context, (char *) data->characters[i], CCX_DECODER_608_SCREEN_WIDTH);
+
#ifdef PYTHON_API
pass_cc_buffer_to_python(data, context);
#else
@@ -1228,7 +1229,26 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
}
freep(&sub->data);
break;
- case CC_BITMAP:
+ case CC_BITMAP:;
+
+#ifdef ENABLE_OCR
+			struct cc_bitmap *rect; // walks the sub->nb_data OCR rectangles stored in sub->data
+			int i;
+			for (i = 0, rect = sub->data; i < sub->nb_data; ++i, ++rect)
+			{
+				if (rect->ocr_text)
+				{
+					int len = strlen(rect->ocr_text);
+					correct_spelling_and_censor_words(context, rect->ocr_text, len);
+					for (int j = 0; j < len; ++j) // j, not i: must not shadow the rectangle index above
+					{
+						if (context->filter_profanity && (unsigned char)rect->ocr_text[j] == 0x98) // 0x98 is censor_word()'s mask byte; skip when censoring is off so genuine 0x98 bytes (e.g. UTF-8 E2 80 98, assuming OCR text is UTF-8 - confirm) survive
+							rect->ocr_text[j] = '*';
+					}
+				}
+			}
+#endif
+
switch (context->write_format)
{
case CCX_OF_CCD:
@@ -1281,7 +1301,6 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
default:
break;
}
-
break;
case CC_RAW:
if (context->send_to_srv)
diff --git a/src/lib_ccx/ccx_encoders_helpers.c b/src/lib_ccx/ccx_encoders_helpers.c
index edcbb8f09..3dd63de10 100644
--- a/src/lib_ccx/ccx_encoders_helpers.c
+++ b/src/lib_ccx/ccx_encoders_helpers.c
@@ -61,7 +61,7 @@ const char *profane_builtin[] =
"goddamn",
"godsdamn",
"hell",
- "holy shit",
+ "holy",
"horseshit",
"motherfucker",
"nigga",
@@ -84,17 +84,17 @@ int string_cmp(const void *p1, const void *p2)
return string_cmp_function(p1, p2, NULL);
}
-void capitalize_word(size_t index, unsigned char *word)
+void capitalize_word(size_t index, char *word)
{
memcpy(word, capitalization_list.words[index], strlen(capitalization_list.words[index]));
}
-void censor_word(size_t index, unsigned char *word)
+void censor_word(size_t index, char *word)
{
memset(word, 0x98, strlen(profane.words[index])); // 0x98 is the asterisk in EIA-608
}
-void call_function_if_match(int line_num, struct eia608_screen *data, struct word_list *list, void (*modification)(size_t, unsigned char *))
+void call_function_if_match(char *line, struct word_list *list, void (*modification)(size_t, char *))
{
char delim[64] = {
' ', '\n', '\r', 0x89, 0x99,
@@ -105,8 +105,8 @@ void call_function_if_match(int line_num, struct eia608_screen *data, struct wor
'.', '/', ':', '^', '_',
'{', '|', '}', '~', '\0' };
- char *line = strdup(data->characters[line_num]);
- char *c = strtok(line, delim);
+ char *line_token = strdup(line);
+ char *c = strtok(line_token, delim);
if (c != NULL)
{
@@ -116,21 +116,11 @@ void call_function_if_match(int line_num, struct eia608_screen *data, struct wor
if (index)
{
- modification(index - list->words, data->characters[line_num] + (c - line));
+ modification(index - list->words, line + (c - line_token));
}
} while ((c = strtok(NULL, delim)) != NULL);
}
- free(line);
-}
-
-void correct_case_with_dictionary(int line_num, struct eia608_screen *data)
-{
- call_function_if_match(line_num, data, &capitalization_list, capitalize_word);
-}
-
-void censor_word_with_dictionary(int line_num, struct eia608_screen *data)
-{
- call_function_if_match(line_num, data, &profane, censor_word);
+ free(line_token);
}
void telx_correct_case(char *sub_line)
@@ -180,7 +170,7 @@ int is_all_caps(struct encoder_ctx *context, int line_num, struct eia608_screen
return (saw_upper && !saw_lower); // 1 if we've seen upper and not lower, 0 otherwise
}
-int clever_capitalize(struct encoder_ctx *context, int line_num, struct eia608_screen *data)
+int clever_capitalize(struct encoder_ctx *context, char *line, unsigned int length)
{
// CFS: Tried doing to clever (see below) but some channels do all uppercase except for
// notes for deaf people (such as "(narrator)" which messes things up.
@@ -188,9 +178,9 @@ int clever_capitalize(struct encoder_ctx *context, int line_num, struct eia608_s
//int doit = is_all_caps(context, line_num, data);
int doit = 1;
- for (int i = 0; i < CCX_DECODER_608_SCREEN_WIDTH; i++)
+ for (int i = 0; i < length; i++)
{
- switch (data->characters[line_num][i])
+ switch (line[i])
{
case ' ':
case 0x89: // This is a transparent space
@@ -206,9 +196,9 @@ int clever_capitalize(struct encoder_ctx *context, int line_num, struct eia608_s
if (doit)
{
if (context->new_sentence)
- data->characters[line_num][i] = cctoupper(data->characters[line_num][i]);
+ line[i] = cctoupper(line[i]);
else
- data->characters[line_num][i] = cctolower(data->characters[line_num][i]);
+ line[i] = cctolower(line[i]);
}
context->new_sentence = 0;
break;
@@ -455,17 +445,17 @@ int add_builtin_words(const char *builtin[], struct word_list *list)
return 0;
}
-void correct_spelling_and_censor_words_608(struct encoder_ctx *context, int line_number, struct eia608_screen *data)
+void correct_spelling_and_censor_words(struct encoder_ctx *context, char *line, unsigned int length)
{
if (context->sentence_cap)
{
- if (clever_capitalize(context, line_number, data))
- correct_case_with_dictionary(line_number, data);
+ if (clever_capitalize(context, line, length))
+ call_function_if_match(line, &capitalization_list, capitalize_word);
}
if (context->filter_profanity)
{
- censor_word_with_dictionary(line_number, data);
+ call_function_if_match(line, &profane, censor_word);
}
}
diff --git a/src/lib_ccx/ccx_encoders_helpers.h b/src/lib_ccx/ccx_encoders_helpers.h
index fef23051f..0396c5e03 100644
--- a/src/lib_ccx/ccx_encoders_helpers.h
+++ b/src/lib_ccx/ccx_encoders_helpers.h
@@ -23,9 +23,7 @@ struct ccx_encoders_helpers_settings_t {
};
// Helper functions
-void correct_case_with_dictionary(int line_num, struct eia608_screen *data);
int is_all_caps(struct encoder_ctx *context, int line_num, struct eia608_screen *data);
-int clever_capitalize(struct encoder_ctx *context, int line_num, struct eia608_screen *data);
void telx_correct_case(char *sub_line);
unsigned get_decoder_line_encoded_for_gui(unsigned char *buffer, int line_num, struct eia608_screen *data);
unsigned get_decoder_line_encoded(struct encoder_ctx *ctx, unsigned char *buffer, int line_num, struct eia608_screen *data);
@@ -36,7 +34,7 @@ int string_cmp_function(const void *p1, const void *p2, void *arg);
int add_word(struct word_list *list, const char *word);
int add_builtin_words(const char *builtin[], struct word_list *list);
-void correct_spelling_and_censor_words_608(struct encoder_ctx *context, int line_number, struct eia608_screen *data);
+void correct_spelling_and_censor_words(struct encoder_ctx *context, char *line, unsigned int length);
unsigned encode_line (struct encoder_ctx *ctx, unsigned char *buffer, unsigned char *text);
diff --git a/src/lib_ccx/ccx_encoders_smptett.c b/src/lib_ccx/ccx_encoders_smptett.c
index f32603ad6..1c2c9e46b 100644
--- a/src/lib_ccx/ccx_encoders_smptett.c
+++ b/src/lib_ccx/ccx_encoders_smptett.c
@@ -201,8 +201,6 @@ int write_cc_buffer_as_smptett(struct eia608_screen *data, struct encoder_ctx *c
{
if (data->row_used[row])
{
- correct_spelling_and_censor_words_608(context, row, data);
-
float row1=0;
float col1=0;
int firstcol=-1;
diff --git a/src/lib_ccx/ccx_encoders_splitbysentence.c b/src/lib_ccx/ccx_encoders_splitbysentence.c
index 45d600f7a..c06946232 100644
--- a/src/lib_ccx/ccx_encoders_splitbysentence.c
+++ b/src/lib_ccx/ccx_encoders_splitbysentence.c
@@ -762,7 +762,7 @@ struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer(struct cc_subtit
if (sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time;
- str = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ str = paraof_ocrtext(sub, context);
if (str)
{
diff --git a/src/lib_ccx/ccx_encoders_spupng.c b/src/lib_ccx/ccx_encoders_spupng.c
index 4c69c71fa..67bc06899 100644
--- a/src/lib_ccx/ccx_encoders_spupng.c
+++ b/src/lib_ccx/ccx_encoders_spupng.c
@@ -449,7 +449,7 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
if (!context->nospupngocr)
{
char *str;
- str = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ str = paraof_ocrtext(sub, context);
if (str)
{
write_spucomment(sp, str);
diff --git a/src/lib_ccx/ccx_encoders_srt.c b/src/lib_ccx/ccx_encoders_srt.c
index dc1a9c2a7..c95e038e2 100644
--- a/src/lib_ccx/ccx_encoders_srt.c
+++ b/src/lib_ccx/ccx_encoders_srt.c
@@ -97,7 +97,7 @@ int write_cc_bitmap_as_srt(struct cc_subtitle *sub, struct encoder_ctx *context)
if(sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time;
- str = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ str = paraof_ocrtext(sub, context);
if (str)
{
if(context->is_mkv == 1) {
@@ -210,8 +210,6 @@ int write_cc_buffer_as_srt(struct eia608_screen *data, struct encoder_ctx *conte
{
if (data->row_used[i])
{
- correct_spelling_and_censor_words_608(context, i, data);
-
if (context->autodash && context->trim_subs)
{
int first=0, last=31, center1=-1, center2=-1;
diff --git a/src/lib_ccx/ccx_encoders_ssa.c b/src/lib_ccx/ccx_encoders_ssa.c
index 104ec8ba3..5f39bdc22 100644
--- a/src/lib_ccx/ccx_encoders_ssa.c
+++ b/src/lib_ccx/ccx_encoders_ssa.c
@@ -91,7 +91,7 @@ int write_cc_bitmap_as_ssa(struct cc_subtitle *sub, struct encoder_ctx *context)
if(sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time;
- str = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ str = paraof_ocrtext(sub, context);
if (str)
{
// SSA format - change "\r\n" to "\N"
@@ -194,8 +194,6 @@ int write_cc_buffer_as_ssa(struct eia608_screen *data, struct encoder_ctx *conte
{
if (data->row_used[i])
{
- correct_spelling_and_censor_words_608(context, i, data);
-
if (context->autodash && context->trim_subs)
{
int first=0, last=31, center1=-1, center2=-1;
diff --git a/src/lib_ccx/ccx_encoders_transcript.c b/src/lib_ccx/ccx_encoders_transcript.c
index 756f09c81..c62d64db8 100644
--- a/src/lib_ccx/ccx_encoders_transcript.c
+++ b/src/lib_ccx/ccx_encoders_transcript.c
@@ -32,7 +32,7 @@ int write_cc_bitmap_as_transcript(struct cc_subtitle *sub, struct encoder_ctx *c
if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER))
{
char *token = NULL;
- token = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ token = paraof_ocrtext(sub, context);
if (context->transcript_settings->showStartTime)
{
char buf1[80];
@@ -239,9 +239,6 @@ int write_cc_subtitle_as_transcript(struct cc_subtitle *sub, struct encoder_ctx
void write_cc_line_as_transcript2(struct eia608_screen *data, struct encoder_ctx *context, int line_number)
{
int ret = 0;
-
- correct_spelling_and_censor_words_608(context, line_number, data);
-
int length = get_str_basic(context->subline, data->characters[line_number],
context->trim_subs, CCX_ENC_ASCII, context->encoding, CCX_DECODER_608_SCREEN_WIDTH);
diff --git a/src/lib_ccx/ccx_encoders_webvtt.c b/src/lib_ccx/ccx_encoders_webvtt.c
index 98a8c8d8a..21b954a95 100644
--- a/src/lib_ccx/ccx_encoders_webvtt.c
+++ b/src/lib_ccx/ccx_encoders_webvtt.c
@@ -284,7 +284,7 @@ int write_cc_bitmap_as_webvtt(struct cc_subtitle *sub, struct encoder_ctx *conte
if (sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time;
- str = paraof_ocrtext(sub, context->encoded_crlf, context->encoded_crlf_length);
+ str = paraof_ocrtext(sub, context);
if (str)
{
if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER))
diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c
index 3f9df065a..e8b165d4c 100644
--- a/src/lib_ccx/ocr.c
+++ b/src/lib_ccx/ocr.c
@@ -976,7 +976,7 @@ void add_ocrtext2str(char *dest, char *src, const unsigned char *crlf, unsigned
* for all text detected from rectangles
*/
-char *paraof_ocrtext(struct cc_subtitle *sub, const unsigned char *crlf, unsigned crlf_length)
+char *paraof_ocrtext(struct cc_subtitle *sub, struct encoder_ctx *context)
{
int i;
int len = 0;
@@ -1002,7 +1002,7 @@ char *paraof_ocrtext(struct cc_subtitle *sub, const unsigned char *crlf, unsigne
for(i = 0, rect = sub->data; i < sub->nb_data; i++, rect++)
{
if (!rect->ocr_text) continue;
- add_ocrtext2str(str, rect->ocr_text, crlf, crlf_length);
+ add_ocrtext2str(str, rect->ocr_text, context->encoded_crlf, context->encoded_crlf_length);
free(rect->ocr_text);
}
return str;
diff --git a/src/lib_ccx/ocr.h b/src/lib_ccx/ocr.h
index 958be4940..f7332cb0f 100644
--- a/src/lib_ccx/ocr.h
+++ b/src/lib_ccx/ocr.h
@@ -15,6 +15,6 @@ void delete_ocr (void** arg);
void* init_ocr(int lang_index);
char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* indata,int w, int h, struct image_copy *copy);
int ocr_rect(void* arg, struct cc_bitmap *rect, char **str, int bgcolor, int ocr_quantmode);
-char *paraof_ocrtext(struct cc_subtitle *sub, const unsigned char *crlf, unsigned crlf_length);
+char *paraof_ocrtext(struct cc_subtitle *sub, struct encoder_ctx *context);
#endif