From 5ab21439fdfb8e239eb778128590d95967067b46 Mon Sep 17 00:00:00 2001
From: Joseph Artsimovich <joseph@mirriad.com>
Date: Wed, 20 Jul 2011 18:58:27 +0100
Subject: dnxhd: 10-bit support

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 libavcodec/dnxhdenc.c | 240 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 184 insertions(+), 56 deletions(-)

(limited to 'libavcodec/dnxhdenc.c')
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index b65d0bf669..3fb10d4380 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -1,8 +1,10 @@
 /*
  * VC3/DNxHD encoder
  * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ * Copyright (c) 2011 MirriAd Ltd
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
+ * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
  *
  * This file is part of Libav.
  *
@@ -32,6 +34,7 @@
 #include "dnxhdenc.h"
 
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+#define DNX10BIT_QMAT_SHIFT 18 // The largest value that will not lead to overflow for 10bit samples.
 
 static const AVOption options[]={
     {"nitris_compat", "encode with Avid Nitris compatibility", offsetof(DNXHDEncContext, nitris_compat), FF_OPT_TYPE_INT, {.dbl = 0}, 0, 1, VE},
@@ -41,7 +44,7 @@ static const AVClass class = { "dnxhd", av_default_item_name, options, LIBAVUTIL
 
 #define LAMBDA_FRAC_BITS 10
 
-static void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
+static void dnxhd_8bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
     int i;
     for (i = 0; i < 4; i++) {
@@ -58,6 +61,43 @@ static void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels,
     memcpy(block + 24, block - 32, sizeof(*block) * 8);
 }
 
+static av_always_inline void dnxhd_10bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
+{
+    int i;
+
+    // Copy 4 source lines (8 DCTELEM-sized samples each, line_size is a byte stride) into
+    // block rows 0..3 and mirror them into rows 7..4, giving p0 p1 p2 p3 p3 p2 p1 p0.
+    for (i = 0; i < 4; i++) {
+        memcpy(block + i       * 8, pixels + i * line_size, 8 * sizeof(*block));
+        memcpy(block + (7 - i) * 8, pixels + i * line_size, 8 * sizeof(*block));
+    }
+}
+
+static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, DCTELEM *block,
+                                    int n, int qscale, int *overflow) // n and overflow are not used in this body
+{
+    const uint8_t *scantable= ctx->intra_scantable.scantable;
+    const int *qmat = ctx->q_intra_matrix[qscale]; // per-qscale multipliers, indexed like block[]
+    int last_non_zero = 0; // scan position of the last non-zero AC level; stays 0 if there is none
+
+    ctx->dsp.fdct(block); // transforms block in place
+
+    // Divide by 4 with rounding, to compensate scaling of DCT coefficients
+    block[0] = (block[0] + 2) >> 2;
+
+    for (int i = 1; i < 64; ++i) { // AC coefficients only; DC was handled above
+        int j = scantable[i];
+        int sign = block[j] >> 31; // 0 for non-negative, -1 for negative (assumes arithmetic right shift)
+        int level = (block[j] ^ sign) - sign; // absolute value
+        level = level * qmat[j] >> DNX10BIT_QMAT_SHIFT; // fixed-point multiply, truncating toward zero
+        block[j] = (level ^ sign) - sign; // re-apply the original sign
+        if (level)
+            last_non_zero = i;
+    }
+
+    return last_non_zero; // position in scan order, not an index into block[]
+}
+
 static int dnxhd_init_vlc(DNXHDEncContext *ctx)
 {
     int i, j, level, run;
@@ -118,31 +158,55 @@ static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
     // init first elem to 1 to avoid div by 0 in convert_matrix
     uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t*
     int qscale, i;
+    const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
+    const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
 
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
     FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
 
-    for (i = 1; i < 64; i++) {
-        int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
-        weight_matrix[j] = ctx->cid_table->luma_weight[i];
-    }
-    ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
-                      ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
-    for (i = 1; i < 64; i++) {
-        int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
-        weight_matrix[j] = ctx->cid_table->chroma_weight[i];
-    }
-    ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
-                      ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
-    for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
-        for (i = 0; i < 64; i++) {
-            ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
-            ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
-            ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
+    if (ctx->cid_table->bit_depth == 8) {
+        for (i = 1; i < 64; i++) {
+            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+            weight_matrix[j] = ctx->cid_table->luma_weight[i];
+        }
+        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
+                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
+        for (i = 1; i < 64; i++) {
+            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+            weight_matrix[j] = ctx->cid_table->chroma_weight[i];
+        }
+        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
+                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
+
+        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
+            for (i = 0; i < 64; i++) {
+                ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
+                ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
+                ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
+            }
+        }
+    } else {
+        // 10-bit
+        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
+            for (i = 1; i < 64; i++) {
+                int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+
+                // The quantization formula from the VC-3 standard is:
+                // quantized = sign(block[i]) * floor(abs(block[i]/s) * p / (qscale * weight_table[i]))
+                // Where p is 32 for 8-bit samples and 8 for 10-bit ones.
+                // The s factor compensates scaling of DCT coefficients done by the DCT routines,
+                // and therefore is not present in the standard.  It's 8 for 8-bit samples and 4 for 10-bit ones.
+                // We want values of ctx->qmatrix_l and ctx->qmatrix_c to be:
+                // ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) / (qscale * weight_table[i])
+                // For 10-bit samples, p / s == 2
+                ctx->qmatrix_l[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * luma_weight_table[i]);
+                ctx->qmatrix_c[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * chroma_weight_table[i]);
+            }
         }
     }
+
     return 0;
  fail:
     return -1;
@@ -165,10 +229,22 @@ static int dnxhd_init_rc(DNXHDEncContext *ctx)
 static int dnxhd_encode_init(AVCodecContext *avctx)
 {
     DNXHDEncContext *ctx = avctx->priv_data;
-    int i, index;
+    int i, index, bit_depth;
+
+    switch (avctx->pix_fmt) {
+    case PIX_FMT_YUV422P:
+        bit_depth = 8;
+        break;
+    case PIX_FMT_YUV422P10:
+        bit_depth = 10;
+        break;
+    default:
+        av_log(avctx, AV_LOG_ERROR, "pixel format is incompatible with DNxHD\n");
+        return -1;
+    }
 
-    ctx->cid = ff_dnxhd_find_cid(avctx);
-    if (!ctx->cid || avctx->pix_fmt != PIX_FMT_YUV422P) {
+    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
+    if (!ctx->cid) {
         av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n");
         return -1;
     }
@@ -181,15 +257,25 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
     ctx->m.mb_intra = 1;
     ctx->m.h263_aic = 1;
 
-    ctx->get_pixels_8x4_sym = dnxhd_get_pixels_8x4;
+    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;
 
     dsputil_init(&ctx->m.dsp, avctx);
     ff_dct_common_init(&ctx->m);
+    if (!ctx->m.dct_quantize)
+        ctx->m.dct_quantize = dct_quantize_c;
+
+    if (ctx->cid_table->bit_depth == 10) {
+       ctx->m.dct_quantize = dnxhd_10bit_dct_quantize;
+       ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
+       ctx->block_width_l2 = 4;
+    } else {
+       ctx->get_pixels_8x4_sym = dnxhd_8bit_get_pixels_8x4_sym;
+       ctx->block_width_l2 = 3;
+    }
+
 #if HAVE_MMX
     ff_dnxhd_init_mmx(ctx);
 #endif
-    if (!ctx->m.dct_quantize)
-        ctx->m.dct_quantize = dct_quantize_c;
 
     ctx->m.mb_height = (avctx->height + 15) / 16;
     ctx->m.mb_width  = (avctx->width  + 15) / 16;
@@ -255,7 +341,7 @@ static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
     AV_WB16(buf + 0x1a, avctx->width);  // SPL
     AV_WB16(buf + 0x1d, avctx->height>>ctx->interlaced); // NAL
 
-    buf[0x21] = 0x38; // FIXME 8 bit per comp
+    buf[0x21] = ctx->cid_table->bit_depth == 10 ? 0x58 : 0x38;
     buf[0x22] = 0x88 + (ctx->interlaced<<2);
     AV_WB32(buf + 0x28, ctx->cid); // CID
     buf[0x2c] = ctx->interlaced ? 0 : 0x80;
@@ -321,15 +407,27 @@ static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *b
         if (level) {
             if (level < 0) {
                 level = (1-2*level) * qscale * weight_matrix[i];
-                if (weight_matrix[i] != 32)
-                    level += 32;
-                level >>= 6;
+                if (ctx->cid_table->bit_depth == 10) {
+                    if (weight_matrix[i] != 8)
+                        level += 8;
+                    level >>= 4;
+                } else {
+                    if (weight_matrix[i] != 32)
+                        level += 32;
+                    level >>= 6;
+                }
                 level = -level;
             } else {
                 level = (2*level+1) * qscale * weight_matrix[i];
-                if (weight_matrix[i] != 32)
-                    level += 32;
-                level >>= 6;
+                if (ctx->cid_table->bit_depth == 10) {
+                    if (weight_matrix[i] != 8)
+                        level += 8;
+                    level >>= 4;
+                } else {
+                    if (weight_matrix[i] != 32)
+                        level += 32;
+                    level >>= 6;
+                }
             }
             block[j] = level;
         }
@@ -364,22 +462,24 @@ static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *bl
 
 static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
 {
-    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << 4);
-    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
-    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
+    const int bs = ctx->block_width_l2;
+    const int bw = 1 << bs;
+    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << bs+1);
+    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
+    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
     DSPContext *dsp = &ctx->m.dsp;
 
-    dsp->get_pixels(ctx->blocks[0], ptr_y,     ctx->m.linesize);
-    dsp->get_pixels(ctx->blocks[1], ptr_y + 8, ctx->m.linesize);
-    dsp->get_pixels(ctx->blocks[2], ptr_u,     ctx->m.uvlinesize);
-    dsp->get_pixels(ctx->blocks[3], ptr_v,     ctx->m.uvlinesize);
+    dsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
+    dsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
+    dsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
+    dsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);
 
     if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
         if (ctx->interlaced) {
-            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,     ctx->m.linesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,    ctx->m.uvlinesize);
-            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,    ctx->m.uvlinesize);
+            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
+            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
+            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
+            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
         } else {
             dsp->clear_block(ctx->blocks[4]);
             dsp->clear_block(ctx->blocks[5]);
@@ -387,10 +487,10 @@ static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, in
             dsp->clear_block(ctx->blocks[7]);
         }
     } else {
-        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,     ctx->m.linesize);
-        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
-        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,    ctx->m.uvlinesize);
-        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,    ctx->m.uvlinesize);
+        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
+        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
+        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
+        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
     }
 }
 
@@ -417,7 +517,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, i
 
     ctx->m.last_dc[0] =
     ctx->m.last_dc[1] =
-    ctx->m.last_dc[2] = 1024;
+    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
 
     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
@@ -440,6 +540,8 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, i
             diff = block[0] - ctx->m.last_dc[n];
             if (diff < 0) nbits = av_log2_16bit(-2*diff);
             else          nbits = av_log2_16bit( 2*diff);
+
+            assert(nbits < ctx->cid_table->bit_depth + 4);
             dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;
 
             ctx->m.last_dc[n] = block[0];
@@ -465,7 +567,7 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int
 
     ctx->m.last_dc[0] =
     ctx->m.last_dc[1] =
-    ctx->m.last_dc[2] = 1024;
+    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
     for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
         unsigned mb = mb_y * ctx->m.mb_width + mb_x;
         int qscale = ctx->mb_qscale[mb];
@@ -515,13 +617,39 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int
     DNXHDEncContext *ctx = avctx->priv_data;
     int mb_y = jobnr, mb_x;
     ctx = ctx->thread[threadnr];
-    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
-        unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
-        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
-        int sum      = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
-        int varc     = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
-        ctx->mb_cmp[mb].value = varc;
-        ctx->mb_cmp[mb].mb = mb;
+    if (ctx->cid_table->bit_depth == 8) {
+        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize);
+        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
+            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
+            int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
+            int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
+            ctx->mb_cmp[mb].value = varc;
+            ctx->mb_cmp[mb].mb = mb;
+        }
+    } else { // 10-bit
+        int const linesize = ctx->m.linesize >> 1;
+        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x) {
+            uint16_t *pix = (uint16_t*)ctx->thread[0]->src[0] + ((mb_y << 4) * linesize) + (mb_x << 4);
+            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
+            int sum = 0;
+            int sqsum = 0;
+            int mean, sqmean;
+            // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
+            for (int i = 0; i < 16; ++i) {
+                for (int j = 0; j < 16; ++j) {
+                    // Turn 16-bit pixels into 10-bit ones.
+                    int const sample = (unsigned)pix[j] >> 6;
+                    sum += sample;
+                    sqsum += sample * sample;
+                    // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
+                }
+                pix += linesize;
+            }
+            mean = sum >> 8; // 16*16 == 2^8
+            sqmean = sqsum >> 8;
+            ctx->mb_cmp[mb].value = sqmean - mean * mean;
+            ctx->mb_cmp[mb].mb = mb;
+        }
     }
     return 0;
 }
@@ -871,7 +999,7 @@ AVCodec ff_dnxhd_encoder = {
     dnxhd_encode_picture,
     dnxhd_encode_end,
     .capabilities = CODEC_CAP_SLICE_THREADS,
-    .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_NONE},
+    .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_YUV422P10, PIX_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
     .priv_class = &class,
 };
-- 
cgit v1.2.3