movtextdec: fix handling of UTF-8 subtitles

Subtitles which contained styled UTF-8 subtitles (i.e. not just 7 bit ASCII characters) were not handled correctly. The spec mandates that styling start/end ranges are in "characters". It's not quite clear what a "character" is supposed to be, but maybe they mean unicode codepoints. FFmpeg's decoder treated the style ranges as byte idexes, which could lead to UTF-8 sequences being broken, and the common code dropping the whole subtitle line. Change this and count the codepoint instead. This also means that even if this is somehow wrong, the decoder won't break UTF-8 sequences anymore. The sample which led me to investigate this now appears to work correctly.
author: wm4 <nfxjfg@googlemail.com> 2018-03-24 15:41:54 +0100
committer: wm4 <nfxjfg@googlemail.com> 2018-03-25 19:27:02 +0200
commit: b0644c3e1a96397ee5e2448c542fa4c3bc319537 (patch)
tree: b08e9604d82001257ba73bdc4626953c6690b58b
parent: b7d0d912ef9b60eae962e4622d72860af31a8b00 (diff)
1 files changed, 37 insertions, 13 deletions
diff --git a/libavcodec/movtextdec.c b/libavcodec/movtextdec.c
index bd19577724..89ac791602 100644
--- a/libavcodec/movtextdec.c
+++ b/libavcodec/movtextdec.c
@@ -326,9 +326,24 @@ static const Box box_types[] = {
 
 const static size_t box_count = FF_ARRAY_ELEMS(box_types);
 
+// Return byte length of the UTF-8 sequence starting at text[0]. 0 on error.
+static int get_utf8_length_at(const char *text, const char *text_end)
+{
+    const char *start = text;
+    int err = 0;
+    uint32_t c;
+    GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;);
+    if (err)
+        goto error;
+    return text - start;
+error:
+    return 0;
+}
+
 static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
-                        MovTextContext *m)
+                       AVCodecContext *avctx)
 {
+    MovTextContext *m = avctx->priv_data;
     int i = 0;
     int j = 0;
     int text_pos = 0;
@@ -342,6 +357,8 @@ static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
     }
 
     while (text < text_end) {
+        int len;
+
         if (m->box_flags & STYL_BOX) {
             for (i = 0; i < m->style_entries; i++) {
                 if (m->s[i]->style_flag && text_pos == m->s[i]->style_end) {
@@ -388,17 +405,24 @@ static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end,
             }
         }
 
-        switch (*text) {
-        case '\r':
-            break;
-        case '\n':
-            av_bprintf(buf, "\\N");
-            break;
-        default:
-            av_bprint_chars(buf, *text, 1);
-            break;
+        len = get_utf8_length_at(text, text_end);
+        if (len < 1) {
+            av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n");
+            len = 1;
+        }
+        for (i = 0; i < len; i++) {
+            switch (*text) {
+            case '\r':
+                break;
+            case '\n':
+                av_bprintf(buf, "\\N");
+                break;
+            default:
+                av_bprint_chars(buf, *text, 1);
+                break;
+            }
+            text++;
         }
-        text++;
         text_pos++;
     }
 
@@ -507,10 +531,10 @@ static int mov_text_decode_frame(AVCodecContext *avctx,
             }
             m->tracksize = m->tracksize + tsmb_size;
         }
-        text_to_ass(&buf, ptr, end, m);
+        text_to_ass(&buf, ptr, end, avctx);
         mov_text_cleanup(m);
     } else
-        text_to_ass(&buf, ptr, end, m);
+        text_to_ass(&buf, ptr, end, avctx);
 
     ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL);
     av_bprint_finalize(&buf, NULL);
author	wm4 <nfxjfg@googlemail.com>	2018-03-24 15:41:54 +0100
committer	wm4 <nfxjfg@googlemail.com>	2018-03-25 19:27:02 +0200
commit	b0644c3e1a96397ee5e2448c542fa4c3bc319537 (patch)
tree	b08e9604d82001257ba73bdc4626953c6690b58b
parent	b7d0d912ef9b60eae962e4622d72860af31a8b00 (diff)