xiph
/
daala
miroir de https://git.xiph.org/daala.git


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859
							#LyX 2.0 created this file. For more info see http://www.lyx.org/
\lyxformat 413
\begin_document
\begin_header
\textclass article
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman default
\font_sans default
\font_typewriter default
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100
\font_tt_scale 100

\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\use_hyperref false
\papersize default
\use_geometry false
\use_amsmath 1
\use_esint 1
\use_mhchem 1
\use_mathdots 1
\cite_engine basic
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Title
Energy Preservation in PVQ-Based Video Coding
\end_layout

\begin_layout Author
Jean-Marc Valin
\end_layout

\begin_layout Section
Introduction
\end_layout

\begin_layout Standard
This mini-paper describes a proposal for adapting the CELT energy conservation
 principle to video coding based on a pyramid vector quantizer (PVQ).
 One potential advantage of conserving energy of the AC coefficients in
 video coding is preserving textures rather than low-passing them.
 Also, by introducing a fixed-resolution PVQ-type quantizer, we automatically
 gain a simple activity masking model.
\end_layout

\begin_layout Standard
The main challenge of adapting this scheme to video is that we have a good
 prediction (the reference frame), so we are essentially starting from a
 point that is already on the PVQ hyper-sphere, rather than at the origin
 like in CELT.
 Other challenges are the introduction of a quantization matrix and the
 fact that we want the reference (motion predicted) data to perfectly correspond
 to one of the entries in our codebook.
\end_layout

\begin_layout Section
Encoder
\end_layout

\begin_layout Standard
Let vector 
\begin_inset Formula $\mathbf{x}_{d}$
\end_inset

 denote the (pre-normalization) DCT band to be coded in the current block
 and vector 
\begin_inset Formula $\mathbf{r}_{d}$
\end_inset

 denote the corresponding reference after motion compensation, the encoder
 computes and encodes the 
\begin_inset Quotes eld
\end_inset

band gain
\begin_inset Quotes erd
\end_inset


\begin_inset Formula 
\begin{equation}
g=\sqrt{\mathbf{x}_{d}^{T}\mathbf{x}_{d}}\,.\label{eq:band-energy}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
Let 
\begin_inset Formula $\mathbf{Q}$
\end_inset

 be a diagonal matrix containing the quantization step size for each element
 of 
\begin_inset Formula $\mathbf{x}_{d}$
\end_inset

, the normalized band is computed as 
\begin_inset Formula 
\begin{equation}
\mathbf{x}=\frac{\mathbf{Q}^{-1}\mathbf{x}_{d}}{\left\Vert \mathbf{Q}^{-1}\mathbf{x}_{d}\right\Vert }\,,\label{eq:normalized-x}
\end{equation}

\end_inset

with the normalized reference 
\begin_inset Formula $\mathbf{r}$
\end_inset

 similarly computed based on 
\begin_inset Formula $\mathbf{r}_{d}$
\end_inset

.
 The encoder then finds the position and sign of the maximum value in 
\begin_inset Formula $\mathbf{r}$
\end_inset


\begin_inset Formula 
\begin{align}
m & =\underset{i}{\mathrm{argmax}}\left|r_{i}\right|\label{eq:reflection-argmax}\\
s & =\mathrm{sgn}\left(r_{m}\right)\label{eq:reflection-sign}
\end{align}

\end_inset

and computes the Householder reflection that reflects 
\begin_inset Formula $\mathbf{r}$
\end_inset

 to 
\begin_inset Formula $-s\mathbf{e}_{m}$
\end_inset

.
 The reflection vector is given by
\begin_inset Formula 
\begin{equation}
\mathbf{v}=\mathbf{r}+s\mathbf{e}_{m}\,.\label{eq:reflection-vector}
\end{equation}

\end_inset

The encoder reflects the normalized band to find
\begin_inset Formula 
\begin{equation}
\mathbf{z}=\mathbf{x}-\frac{2}{\mathbf{v}^{T}\mathbf{v}}\mathbf{v}\left(\mathbf{v}^{T}\mathbf{x}\right)\,.\label{eq:reflection}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
The similarity between the current band and the reference band is represented
 by the angle (assuming no quantization)
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
\theta=\arccos\frac{-sz_{m}}{\left\Vert \mathbf{z}\right\Vert }\ .\label{eq:unquant-theta}
\end{equation}

\end_inset

Let 
\begin_inset Formula $N$
\end_inset

 be the number of dimensions in 
\begin_inset Formula $\mathbf{x}$
\end_inset

 and 
\begin_inset Formula $K$
\end_inset

 be the number of pulses in our codebooks, we search for the codebook entry
 
\begin_inset Formula 
\begin{equation}
q=\underset{i}{\mathrm{argmax}}\frac{\mathbf{p}_{i}^{T}\left(\mathbf{z}-z_{m}\mathbf{e}_{m}\right)}{\sqrt{\mathbf{p}_{i}^{T}\mathbf{p}_{i}}}\,,\label{eq:quantization}
\end{equation}

\end_inset

where 
\begin_inset Formula $\mathbf{p}_{i}$
\end_inset

 is the 
\begin_inset Formula $i^{th}$
\end_inset

 combination of magnitudes and signs that satisfies 
\begin_inset Formula $\left\Vert \mathbf{p}_{i}\right\Vert _{L1}=K$
\end_inset

.
 Let 
\begin_inset Formula $\theta_{opt}$
\end_inset

 be the post-quantization optimal angle, the mean square error becomes
\begin_inset Formula 
\begin{equation}
E=\left\Vert -s\cos\theta_{opt}\mathbf{e}_{m}+\sin\theta_{opt}\hat{\mathbf{p}}-\mathbf{z}\right\Vert ^{2}\ ,\label{eq:error-theta}
\end{equation}

\end_inset

where 
\begin_inset Formula $\hat{\mathbf{p}}=\mathbf{p}_{q}/\sqrt{\mathbf{p}_{q}^{T}\mathbf{p}_{q}}$
\end_inset

.
 Solving for 
\begin_inset Formula $\frac{\partial E}{\partial\theta}=0$
\end_inset

 and knowing that 
\begin_inset Formula $\hat{\mathbf{p}}^{T}\mathbf{e}_{m}=0$
\end_inset

, we have
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{align}
\frac{\partial}{\partial\theta}\left(-s\cos\theta_{opt}\mathbf{e}_{m}+\sin\theta_{opt}\hat{\mathbf{p}}\right)^{T}\mathbf{z} & =0\nonumber \\
\left(s\sin\theta_{opt}\mathbf{e}_{m}+\cos\theta_{opt}\hat{\mathbf{p}}\right)^{T}\mathbf{z} & =0\nonumber \\
\sin\theta_{opt}\mathbf{e}_{m}^{T}\mathbf{z} & =-s\cos\theta_{opt}\hat{\mathbf{p}}^{T}\mathbf{z}\nonumber \\
\theta_{opt} & =-s\arctan\frac{\hat{\mathbf{p}}^{T}\mathbf{z}}{\mathbf{e}_{m}^{T}\mathbf{z}}\ .\label{eq:theta-opt}
\end{align}

\end_inset


\end_layout

\begin_layout Standard
The resolution of the gain and angle, as well as the number of pulses should
 all be derived from a single quality parameter.
 The encoder transmits the gain 
\begin_inset Formula $g$
\end_inset

, the quantized angle 
\begin_inset Formula $\hat{\theta}$
\end_inset

, and the 
\begin_inset Formula $\mathbf{p}_{q}$
\end_inset

 vector.
 Neither 
\begin_inset Formula $s$
\end_inset

 nor 
\begin_inset Formula $m$
\end_inset

 need to be transmitted since the decoder can derive them from the reference.
 Encoding 
\begin_inset Formula $\mathbf{p}_{q}$
\end_inset

 should make use of the fact that 
\begin_inset Formula $K$
\end_inset

 is known and is left as an exercise to the implementer.
 
\end_layout

\begin_layout Section
Decoder
\end_layout

\begin_layout Standard
The decoder starts by decoding the codebook entry 
\begin_inset Formula $\mathbf{p}_{q}$
\end_inset

 and uses it to reconstruct the unit-norm reflected band as
\begin_inset Formula 
\begin{equation}
\hat{\mathbf{z}}=-s\cos\hat{\theta}\mathbf{e}_{m}+\sin\hat{\theta}\frac{\mathbf{p}_{q}}{\sqrt{\mathbf{p}_{q}^{T}\mathbf{p}_{q}}}\,.\label{eq:reconstruct}
\end{equation}

\end_inset

Because the decoder has access to exactly the same reference as the encoder,
 it is able to apply 
\begin_inset CommandInset ref
LatexCommand eqref
reference "eq:reflection-argmax"

\end_inset

-
\begin_inset CommandInset ref
LatexCommand eqref
reference "eq:reflection-vector"

\end_inset

 to obtain the same 
\begin_inset Formula $\mathbf{v}$
\end_inset

 as used in the encoder.
 The decoded normalized band is 
\begin_inset Formula 
\begin{equation}
\hat{\mathbf{x}}=\hat{\mathbf{z}}-\frac{2}{\mathbf{v}^{T}\mathbf{v}}\mathbf{v}\left(\mathbf{v}^{T}\hat{\mathbf{z}}\right)\,.\label{eq:decoder-reflection}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
The renormalized band is computed by taking into account the quantization
 resolution:
\begin_inset Formula 
\begin{equation}
\hat{\mathbf{x}}_{d}=\hat{g}\frac{\mathbf{Q}\hat{\mathbf{x}}}{\left\Vert \mathbf{Q}\hat{\mathbf{x}}\right\Vert }\,.\label{eq:decoded-band}
\end{equation}

\end_inset


\end_layout

\begin_layout Section
Coding Resolution
\end_layout

\begin_layout Standard
It is desirable for a single quality parameter to control 
\begin_inset Formula $K$
\end_inset

 and the resolution of gain and angle.
 That quality parameter should also take into account activity masking to
 some extent.
 According to Jason Garrett-Glaser, x264's activity masking uses a resolution
 proportional to 
\begin_inset Formula $g^{2\alpha}$
\end_inset

, with 
\begin_inset Formula $\alpha=0.173$
\end_inset

.
 We can derive a scalar quantizer that follows this resolution:
\begin_inset Formula 
\begin{equation}
\hat{g}=Q_{g}\gamma^{1+2\alpha}\ ,\label{eq:gain-scalar-quantization}
\end{equation}

\end_inset

where 
\begin_inset Formula $\gamma$
\end_inset

 is the gain quantization index and 
\begin_inset Formula $Q_{g}$
\end_inset

 is the gain resolution and 
\begin_inset Quotes eld
\end_inset

master
\begin_inset Quotes erd
\end_inset

 quality parameter.
 If we assume that MSE is a good criterion, then the angle quantization
 resolution should be (roughly) 
\begin_inset Formula 
\begin{equation}
Q_{\theta}=\frac{d\hat{g}/d\gamma}{\hat{g}}=\frac{Q_{g}\left(1+2\alpha\right)\gamma^{2\alpha}}{Q_{g}\gamma^{1+2\alpha}}=\frac{\left(1+2\alpha\right)}{\gamma}\ .\label{eq:theta-quantization-step}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
To derive the optimal 
\begin_inset Formula $K$
\end_inset

 we need to consider the cosine distance between adjacent codevectors 
\begin_inset Formula $\mathbf{p}_{1}$
\end_inset

 and 
\begin_inset Formula $\mathbf{p}_{2}$
\end_inset

 for two cases: 
\begin_inset Formula $K<N$
\end_inset

 and 
\begin_inset Formula $K>N$
\end_inset

.
 For 
\begin_inset Formula $K<N$
\end_inset

, the worst resolution occurs when no value in 
\begin_inset Formula $\mathbf{p}_{q}$
\end_inset

 is larger than one.
 In that case, the two closest codevectors have a cosine distance
\begin_inset Formula 
\begin{align*}
\cos\tau & =\frac{\mathbf{p}_{1}^{T}\mathbf{p}_{2}}{\sqrt{\mathbf{p}_{1}^{T}\mathbf{p}_{1}}\sqrt{\mathbf{p}_{2}^{T}\mathbf{p}_{2}}}\\
 & =\frac{K-1}{K}\\
 & =1-\frac{1}{K}
\end{align*}

\end_inset

By approximating the cosine, we then get
\begin_inset Formula 
\begin{align}
1-\frac{1}{K} & =\cos\tau\approx1-\frac{\tau^{2}}{2}\nonumber \\
K & \approx\frac{2}{\tau^{2}}\label{eq:small-K}
\end{align}

\end_inset

For 
\begin_inset Formula $K>N$
\end_inset

 the worst resolution happens when all values are equal to 
\begin_inset Formula $K/N$
\end_inset

 in 
\begin_inset Formula $\mathbf{p}_{1}$
\end_inset

 and 
\begin_inset Formula $\mathbf{p}_{2}$
\end_inset

 differs by one pulse.
 In that case
\begin_inset Formula 
\begin{align*}
\cos\tau & =\frac{K^{2}/N}{\sqrt{\frac{K^{2}}{N}}\sqrt{\frac{K^{2}}{N}+2}}\\
 & =\frac{1}{\sqrt{1+\frac{2N}{K^{2}}}}\\
 & \approx1-\frac{N}{K^{2}}
\end{align*}

\end_inset

By approximating the cosine, we get
\begin_inset Formula 
\begin{align}
1-\frac{N}{K^{2}} & =\cos\tau\approx1-\frac{\tau^{2}}{2}\nonumber \\
K & \approx\frac{\sqrt{2N}}{\tau}\ .\label{eq:large-K}
\end{align}

\end_inset


\end_layout

\begin_layout Standard
By combining 
\begin_inset CommandInset ref
LatexCommand eqref
reference "eq:small-K"

\end_inset

 with 
\begin_inset CommandInset ref
LatexCommand eqref
reference "eq:large-K"

\end_inset

, we have
\begin_inset Formula 
\begin{equation}
K\approx\min\left(\frac{\sqrt{2N}}{\tau},\frac{2}{\tau^{2}}\right)\label{eq:pulse-allocation}
\end{equation}

\end_inset

The last step is to set 
\begin_inset Formula 
\begin{equation}
\tau=Q_{\theta}/\sin\hat{\theta}\label{eq:tau-from-theta}
\end{equation}

\end_inset

to account for the fact that the more the image differs from the reference,
 the higher the resolution needs to be.
\end_layout

\begin_layout Section
Bi-Prediction
\end_layout

\begin_layout Standard
We can use this scheme for bi-prediction by introducing a second 
\begin_inset Formula $\theta$
\end_inset

 parameter.
 For the case of two (normalized) reference frames 
\begin_inset Formula $\mathbf{r}_{1}$
\end_inset

 and 
\begin_inset Formula $\mathbf{r}_{2}$
\end_inset

, we introduce 
\begin_inset Formula $\mathbf{s}_{1}=\left(\mathbf{r}_{1}+\mathbf{r}_{2}\right)/2$
\end_inset

 and 
\begin_inset Formula $\mathbf{s}_{2}=\left(\mathbf{r}_{1}-\mathbf{r}_{2}\right)/2$
\end_inset

.
 We start by using 
\begin_inset Formula $\mathbf{s}_{1}$
\end_inset

 as a reference, apply the Householder reflection to both 
\begin_inset Formula $\mathbf{x}$
\end_inset

 and 
\begin_inset Formula $\mathbf{s}_{2}$
\end_inset

, and evaluate 
\begin_inset Formula $\theta_{1}$
\end_inset

.
 From there, we derive a second Householder reflection from the reflected
 version of 
\begin_inset Formula $\mathbf{s}_{2}$
\end_inset

 and apply it to 
\begin_inset Formula $\mathbf{x}_{r}$
\end_inset

 (the reflected version of 
\begin_inset Formula $\mathbf{x}$
\end_inset

).
 The result is that the 
\begin_inset Formula $\theta_{2}$
\end_inset

 parameter controls how the current image compares to the two reference
 images.
 It should even be possible to use this in the case where the two references
 are before the frame being encoded, i.e.
 P frames based on two parents.
 This might help for fades.
\end_layout

\begin_layout Section
Theoretical Ramblings on SSIM
\end_layout

\begin_layout Standard
According to Wikipedia, the SSIM metric is defined as
\begin_inset Formula 
\[
\mathrm{SSIM}\left(x,y\right)=\left(\frac{2\mu_{x}\mu_{y}+c_{1}}{\mu_{x}^{2}+\mu_{y}^{2}+c_{1}}\right)\cdot\left(\frac{2\sigma_{xy}+c_{2}}{\sigma_{x}^{2}+\sigma_{y}^{2}+c_{2}}\right)\,.
\]

\end_inset

Where 
\begin_inset Formula $\mu_{x}$
\end_inset

 and 
\begin_inset Formula $\mu_{y}$
\end_inset

 are the DC of images 
\begin_inset Formula $x$
\end_inset

 and 
\begin_inset Formula $y$
\end_inset

 and 
\begin_inset Formula $\sigma_{x}$
\end_inset

 and 
\begin_inset Formula $\sigma_{y}$
\end_inset

 are the RMS values of the AC coefficients of images 
\begin_inset Formula $x$
\end_inset

 and 
\begin_inset Formula $y$
\end_inset

.
 From now on, we will consider 
\begin_inset Formula $x$
\end_inset

 to be the reference image and 
\begin_inset Formula $y$
\end_inset

 to be the coded image.
 Let's ignore the DC for now and define a Simplified SSIM metric as
\begin_inset Formula 
\[
\mathrm{SSSIM}\left(x,y\right)=\frac{2\sigma_{xy}+c_{2}}{\sigma_{x}^{2}+\sigma_{y}^{2}+c_{2}}\,.
\]

\end_inset

This is the metric we'll try optimizing here.
 First, let 
\begin_inset Formula $g=\sigma_{y}/\sigma_{x}$
\end_inset

 be the gain that the codec causes on the AC coefficients and 
\begin_inset Formula $\hat{y}=y/g$
\end_inset

.
 Solving for 
\begin_inset Formula 
\[
\frac{d}{dg}\mathrm{SSSIM}\left(x,y\right)=\frac{d}{dg}\frac{2g\sigma_{x\hat{y}}+c_{2}}{\sigma_{x}^{2}\left(1+g^{2}\right)+c_{2}}=0
\]

\end_inset

we find that the optimal gain that maximizes SSSIM is 
\begin_inset Formula $g_{max}\approx1-\frac{c_{2}}{2\sigma_{x}^{2}}\cdot\left(\frac{\sigma_{x}^{2}}{\sigma_{x\hat{y}}}-1\right)$
\end_inset

.
 This means that conserving energy (
\begin_inset Formula $g_{max}=1$
\end_inset

) is a good thing to do as long as the contrast is high enough (
\family roman
\series medium
\shape up
\size normal
\emph off
\bar no
\strikeout off
\uuline off
\uwave off
\noun off
\color none

\begin_inset Formula $\frac{c_{2}}{2\sigma_{x}^{2}}$
\end_inset

 is small) or the bit-rate is high enough (
\begin_inset Formula $\frac{\sigma_{x}^{2}}{\sigma_{x\hat{y}}}$
\end_inset

 close to 1).
\end_layout

\begin_layout Standard
Now, let's consider a spherical horse in simple harmonic motion...
 or to be more exact, let's consider that the PVQ codebook is perfectly
 uniform over the sphere and that 
\begin_inset Formula $g_{max}=1$
\end_inset

.
 We get 
\begin_inset Formula 
\[
\mathrm{SSSIM}\left(x,y\right)=\frac{\sigma_{xy}+c_{2}/2}{\sigma_{x}^{2}+c_{2}/2}\,,
\]

\end_inset

where 
\begin_inset Formula $\sigma_{xy}/\sigma_{x}^{2}=\cos\theta$
\end_inset

 is the cosine distance between 
\begin_inset Formula $x$
\end_inset

 and 
\begin_inset Formula $y$
\end_inset

.
 Assuming a uniform quantizer, we have 
\begin_inset Formula 
\[
\theta\propto2^{-b/(N-1)}\,,
\]

\end_inset

where 
\begin_inset Formula $b$
\end_inset

 is the number of bits allocated and 
\begin_inset Formula $N$
\end_inset

 is the number of AC coefficients.
 Let 
\begin_inset Formula $c'=c_{2}/(2\sigma_{x}^{2})$
\end_inset

...
\end_layout

\begin_layout Standard
<FIXME: This needs to be cleaned up>
\end_layout

\begin_layout Standard
\begin_inset Formula 
\[
\mathrm{SSSIM}\left(x,y\right)=\frac{\cos\theta+c'}{1+c'}\approx\frac{1-\theta^{2}/2+c'}{1+c'}\,,
\]

\end_inset


\end_layout

\begin_layout Standard
Trying to make SSSIM equal for two blocks:
\begin_inset Formula 
\[
\frac{1+c_{1}'-2^{-2b_{1}/(N-1)}}{1+c_{1}'}=\frac{1+c_{2}'-2^{-2b_{2}/(N-1)}}{1+c_{2}'}
\]

\end_inset


\end_layout

\begin_layout Standard
The optimal bit offset is 
\begin_inset Formula 
\[
b=-\frac{N-1}{2}\log_{2}\left(1+\frac{c_{2}}{2\sigma_{x}^{2}}\right)
\]

\end_inset

 From this (theoretically) optimal offset, we can encode only the deviation
 from the optimal allocation.
 In practice, 
\begin_inset Formula $b$
\end_inset

 would not be an exact bit allocation like for CELT, but only the 
\begin_inset Quotes eld
\end_inset

quantization step exponent
\begin_inset Quotes erd
\end_inset

.
\end_layout

\begin_layout Section
Conclusion
\end_layout

\begin_layout Standard
While it seems like a good idea, we're still experimenting with the details.
\end_layout

\end_body
\end_document