Initial commit.

author: Anton Khirnov <anton@khirnov.net> 2018-04-07 18:13:49 +0200
committer: Anton Khirnov <anton@khirnov.net> 2018-04-07 18:13:49 +0200
commit: d53f73b7f7c728e96ffc07b55fa30b9e6fc5121c (patch)
tree: c79d588e646ea3e65b2db1532d65bb691ee88b0e
26 files changed, 9464 insertions, 0 deletions
diff --git a/configuration.ccl b/configuration.ccl
new file mode 100644
index 0000000..3565166
--- /dev/null
+++ b/configuration.ccl
@@ -0,0 +1,2 @@
+# Configuration definition for thorn MinimalDistortionAxi
+
diff --git a/interface.ccl b/interface.ccl
new file mode 100644
index 0000000..703f924
--- /dev/null
+++ b/interface.ccl
@@ -0,0 +1,16 @@
+# Interface definition for thorn MinimalDistortionAxi
+implements: MinimalDistortionAxi
+
+INHERITS: ADMBase grid CoordBase MethodOfLines
+# Prototypes of the MoL registration functions that this thorn requires (see REQUIRES FUNCTION below)
+CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx)
+
+REQUIRES FUNCTION MoLRegisterConstrained
+REQUIRES FUNCTION MoLRegisterSaveAndRestore
+REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup
+# Coefficient arrays for betax and betaz, each of size basis_order_z x basis_order_r; schedule.ccl enables their storage only when export_coeffs is set
+public:
+CCTK_REAL betax_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant
+CCTK_REAL betaz_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant
diff --git a/param.ccl b/param.ccl
new file mode 100644
index 0000000..3c3d285
--- /dev/null
+++ b/param.ccl
@@ -0,0 +1,37 @@
+# Parameter definitions for thorn MinimalDistortionAxi
+#
+SHARES: ADMBase
+EXTENDS KEYWORD shift_evolution_method
+{
+  "minimal_distortion_axi" :: "minimal distortion axi"
+}
+
+RESTRICTED:
+CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover
+{
+    1: :: ""
+} 40
+
+CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover
+{
+    1: :: ""
+} 40
+# NOTE(review): filter_power, scale_factor and scale_power below have empty descriptions and all default to 64.0 - confirm the defaults are intended and document each parameter
+CCTK_REAL filter_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+CCTK_REAL scale_factor "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+CCTK_REAL scale_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in beta*_coeffs" STEERABLE=recover
+{
+} "no"
diff --git a/schedule.ccl b/schedule.ccl
new file mode 100644
index 0000000..5976500
--- /dev/null
+++ b/schedule.ccl
@@ -0,0 +1,30 @@
+# Schedule definitions for thorn MinimalDistortionAxi
+# Everything below is active only when shift_evolution_method is "minimal_distortion_axi"
+if (CCTK_Equals(shift_evolution_method, "minimal_distortion_axi")) {
+    SCHEDULE minimal_distortion_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_RHS {
+        LANG: C
+    } "Minimal distortion shift eval"
+    # minimal_distortion_solve is ordered before minimal_distortion_eval, which itself runs before ML_BSSN_RHS
+    SCHEDULE minimal_distortion_solve IN ML_BSSN_evolCalcGroup BEFORE minimal_distortion_eval {
+    #SCHEDULE minimal_distortion_solve IN MoL_PreStep {
+        LANG: C
+    } "Minimal distortion solve W"
+
+    #SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution {
+    #    LANG: C
+    #} "Quasimaximal slicing"
+
+    SCHEDULE minimal_distortion_init IN ADMBase_InitialData {
+        LANG: C
+    } ""
+
+    SCHEDULE minimal_distortion_axi_register_mol IN MoL_Register {
+        LANG: C
+    } ""
+
+    # Allocate the exported coefficient arrays only when export_coeffs is set
+    if (export_coeffs) {
+        STORAGE: betax_coeffs
+        STORAGE: betaz_coeffs
+    }
+}
diff --git a/src/MinimalDistortion.m b/src/MinimalDistortion.m
new file mode 100644
index 0000000..c0d34a6
--- /dev/null
+++ b/src/MinimalDistortion.m
@@ -0,0 +1,1490 @@
+
+SetEnhancedTimes[False];
+SetSourceLanguage["C"];
+
+(******************************************************************************)
+(* Options *)
+(******************************************************************************)
+
+createCode[derivOrder_, useJacobian_, splitUpwindDerivs_, evolutionTimelevels_, addMatter_, formulation_] :=
+Module[{prefix, suffix, thorn},
+(* Thorn name = prefix <> formulation <> suffix; the suffix gets "_MP" if useJacobian, "_O<n>" if derivOrder != 4, and "_UPW" unless splitUpwindDerivs *)
+prefix = "ML_";
+suffix =
+  ""
+  <> If [useJacobian, "_MP", ""]
+  <> If [derivOrder!=4, "_O" <> ToString[derivOrder], ""]
+  <> If [splitUpwindDerivs, "", "_UPW"]
+  (* <> If [evolutionTimelevels!=3, "_TL" <> ToString[evolutionTimelevels], ""] *)
+  (* <> If [addMatter==1, "_M", ""] *)
+  ;
+
+thorn = prefix <> formulation <> suffix;
+(* IfCCZ4[expr, else] gives expr when formulation is "CCZ4" and else otherwise; else defaults to an empty Sequence, so the entry simply disappears from a list *)
+SetAttributes[IfCCZ4, HoldAll];
+IfCCZ4[expr_, else_:Sequence[]] := If[formulation === "CCZ4", expr, Unevaluated[else]];
+
+(******************************************************************************)
+(* Derivatives *)
+(******************************************************************************)
+
+KD = KroneckerDelta;
+(* Finite-difference operator definitions, parameterised by fdOrder; the upwind variants are tabulated for fdOrder 2, 4, 6 and 8 and instantiated for directions 1 to 3 *)
+derivatives =
+{
+  PDstandardNth[i_]    -> StandardCenteredDifferenceOperator[1,fdOrder/2,i],
+  PDstandardNth[i_,i_] -> StandardCenteredDifferenceOperator[2,fdOrder/2,i],
+  PDstandardNth[i_,j_] -> StandardCenteredDifferenceOperator[1,fdOrder/2,i] *
+                          StandardCenteredDifferenceOperator[1,fdOrder/2,j],
+  PDdissipationNth[i_] ->
+    (-1)^(fdOrder/2) *
+    spacing[i]^(fdOrder+1) / 2^(fdOrder+2) *
+    StandardCenteredDifferenceOperator[fdOrder+2,fdOrder/2+1,i],
+  
+(* PD: These come from my mathematica notebook
+   "Upwind-Kranc-Convert.nb" that converts upwinding finite
+   differencing operators generated by
+   StandardUpwindDifferenceOperator into this form *)
+
+  Sequence@@Flatten[Table[
+   {PDupwindNth[i] -> Switch[fdOrder,
+      2, (dir[i]*(-3 + 4*shift[i]^dir[i] - shift[i]^(2*dir[i])))/(2*spacing[i]),
+      4, (dir[i]*(-10 - 3/shift[i]^dir[i] + 18*shift[i]^dir[i] -
+          6*shift[i]^(2*dir[i]) + shift[i]^(3*dir[i])))/(12*spacing[i]),
+      6, (dir[i]*(-35 + 2/shift[i]^(2*dir[i]) - 24/shift[i]^dir[i] + 80*shift[i]^dir[i] -
+          30*shift[i]^(2*dir[i]) + 8*shift[i]^(3*dir[i]) - shift[i]^(4*dir[i])))/(60*spacing[i]),
+      8, (dir[i]*(-378 - 5/shift[i]^(3*dir[i]) + 60/shift[i]^(2*dir[i]) - 420/shift[i]^dir[i] +
+          1050*shift[i]^dir[i] - 420*shift[i]^(2*dir[i]) + 140*shift[i]^(3*dir[i]) - 30*shift[i]^(4*dir[i]) +
+          3*shift[i]^(5*dir[i])))/(840*spacing[i])],
+
+    PDupwindNthAnti[i] -> Switch[fdOrder,
+      2, (+1 shift[i]^(-2) -4 shift[i]^(-1) +0 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]),
+      4, (-1 shift[i]^(-3) +6 shift[i]^(-2) -21 shift[i]^(-1 )+0 shift[i]^( 0) +21 shift[i]^(+1)
+          -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]),
+      6, (+1 shift[i]^(-4) -8 shift[i]^(-3) +32 shift[i]^(-2) -104 shift[i]^(-1) +0 shift[i]^( 0)
+          +104 shift[i]^(+1) -32 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]),
+      8, (-3 shift[i]^(-5) +30 shift[i]^(-4) -145 shift[i]^(-3) +480 shift[i]^(-2) -1470 shift[i]^(-1)
+          +0 shift[i]^( 0) +1470 shift[i]^(+1) -480 shift[i]^(+2) +145 shift[i]^(+3) -30 shift[i]^(+4)
+          +3 shift[i]^(+5)) / (1680 spacing[i])],
+
+    PDupwindNthSymm[i] -> Switch[fdOrder,
+     2, (-1 shift[i]^(-2) +4 shift[i]^(-1) -6 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]),
+     4, (+1 shift[i]^(-3) -6 shift[i]^(-2) +15 shift[i]^(-1) -20 shift[i]^( 0) +15 shift[i]^(+1)
+         -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]),
+     6, (-1 shift[i]^(-4) +8 shift[i]^(-3) - 28 shift[i]^(-2)+56 shift[i]^(-1)-70 shift[i]^( 0)
+         +56 shift[i]^(+1) -28 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]),
+     8, (+3 shift[i]^(-5) -30 shift[i]^(-4) +135 shift[i]^(-3) -360 shift[i]^(-2) +630 shift[i]^(-1)
+         -756 shift[i]^( 0) +630 shift[i]^(+1) -360 shift[i]^(+2) +135 shift[i]^(+3) -30 shift[i]^(+4)
+         +3 shift[i]^(+5)) / (1680 spacing[i])],
+
+    (* TODO: make these higher order stencils *)
+    PDonesided[i] -> dir[i] (-1 + shift[i]^dir[i]) / spacing[i]} /. i->j, {j,1,3}],1]
+};
+
+PD     = PDstandardNth;
+PDu    = PDupwindNth;
+PDua   = PDupwindNthAnti;
+PDus   = PDupwindNthSymm;
+(* PDo    = PDonesided; *)
+PDdiss = PDdissipationNth;
+(* Upwind[dir, var, idx]: with splitUpwindDerivs it is dir times the antisymmetric stencil plus Abs[dir] times the symmetric stencil; otherwise it is dir times the combined PDupwindNth stencil *)
+If [splitUpwindDerivs,
+    Upwind[dir_, var_, idx_] := dir PDua[var,idx] + Abs[dir] PDus[var,idx],
+    Upwind[dir_, var_, idx_] := dir PDu[var,idx]];
+
+
+
+(******************************************************************************)
+(* Tensors *)
+(******************************************************************************)
+
+(* Register the tensor quantities with the TensorTools package; each name is listed exactly once *)
+Map [DefineTensor,
+     {normal, tangentA, tangentB, dir,
+      nn, nu, nlen, nlen2, su, vg,
+      xx, rr, th, ph,
+      admg, admK, admalpha, admdtalpha, qmsw, admbeta, admdtbeta, H, M, term1, term2, term3,
+      g, detg, gu, G, R, trR, Km, trK, cdphi, cdphi2,
+      phi, gt, At, Xt, Xtn, Theta, Z,
+      (*
+    alpha, A,
+        *)
+      alpha,
+      beta, B, Atm, Atu, trA, Ats, trAts,
+      Kdot, Xtdot, phidot, K,
+      dottrK, dotXt,
+      cXt, cS, cA,
+      e4phi, em4phi, ddetg, detgt, gtu, ddetgt, dgtu, ddgtu, Gtl, Gtlu, Gt, Ddetgt,
+      Rt, Rphi, gK,
+      T00, T0, T, rho, S,
+      x, y, z, r,
+      epsdiss}];
+
+(* NOTE: It seems as if Lie[.,.] did not take these tensor weights
+   into account.  Presumably, CD[.,.] and CDt[.,.] don't do this either.  *)
+SetTensorAttribute[phi, TensorWeight, +1/6];
+SetTensorAttribute[gt,  TensorWeight, -2/3];
+SetTensorAttribute[Xt,  TensorWeight, +2/3];
+SetTensorAttribute[At,  TensorWeight, -2/3];
+SetTensorAttribute[cXt, TensorWeight, +2/3];
+SetTensorAttribute[cS,  TensorWeight, +2  ];
+(* Declare which index pairs of each tensor are symmetric; the three-argument form names the symmetric pair explicitly *)
+Map [AssertSymmetricIncreasing,
+     {admg[la,lb], admK[la,lb], g[la,lb], K[la,lb], R[la,lb], cdphi2[la,lb],
+      gt[la,lb], At[la,lb], Ats[la,lb], Rt[la,lb], Rphi[la,lb], T[la,lb], Kdot[la, lb]}];
+AssertSymmetricIncreasing [G[ua,lb,lc], lb, lc];
+AssertSymmetricIncreasing [Gtl[la,lb,lc], lb, lc];
+AssertSymmetricIncreasing [Gt[ua,lb,lc], lb, lc];
+AssertSymmetricIncreasing [gK[la,lb,lc], la, lb];
+Map [AssertSymmetricIncreasing,
+     {gu[ua,ub], gtu[ua,ub], Atu[ua,ub]}];
+AssertSymmetricIncreasing [dgtu[ua,ub,lc], ua, ub];
+AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], ua, ub];
+AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], lc, ld];
+(* Covariant derivatives built on PD: CD uses the connection G, CDt uses the connection Gt *)
+DefineConnection [CD, PD, G];
+DefineConnection [CDt, PD, Gt];
+
+(* Use the CartGrid3D variable names *)
+x1=x; x2=y; x3=z;
+
+(* Use the ADMBase variable names *)
+admg11=gxx; admg12=gxy; admg22=gyy; admg13=gxz; admg23=gyz; admg33=gzz;
+admK11=kxx; admK12=kxy; admK22=kyy; admK13=kxz; admK23=kyz; admK33=kzz;
+admalpha=alp;
+admdtalpha=dtalp;
+admbeta1=betax; admbeta2=betay; admbeta3=betaz;
+admdtbeta1=dtbetax; admdtbeta2=dtbetay; admdtbeta3=dtbetaz;
+qmsw=W; (* W belongs to the QuasiMaximalSlicing::W group listed in extraGroups *)
+(*alpha=admalpha;*)
+
+(* Use the TmunuBase variable names *)
+T00=eTtt;
+T01=eTtx; T02=eTty; T03=eTtz;
+T11=eTxx; T12=eTxy; T22=eTyy; T13=eTxz; T23=eTyz; T33=eTzz;
+
+
+
+(******************************************************************************)
+(* Expressions *)
+(******************************************************************************)
+
+(* enum constants for conformalMethod; these must be consistent
+   with the definition of the Cactus parameter conformalMethod *)
+CMphi = 0;
+CMW   = 1;
+(* Determinant of the metric g, and its derivative written as a chain-rule sum over the distinct metric components *)
+detgExpr  = Det [MatrixOfComponents [g [la,lb]]];
+ddetgExpr[la_] =
+  Sum [D[Det[MatrixOfComponents[g[la, lb]]], X] PD[X, la],
+       {X, Union[Flatten[MatrixOfComponents[g[la, lb]]]]}];
+(* The same two expressions for the metric gt *)
+detgtExpr = Det [MatrixOfComponents [gt[la,lb]]];
+ddetgtExpr[la_] =
+  Sum [D[Det[MatrixOfComponents[gt[la, lb]]], X] PD[X, la],
+       {X, Union[Flatten[MatrixOfComponents[gt[la, lb]]]]}];
+(* Radial factors used in the shift equations: eta = R / Max[r, R] with R = SpatialBetaDriverRadius; theta = Min[Exp[1 - r/SpatialShiftGammaCoeffRadius], 1] *)
+etaExpr = SpatialBetaDriverRadius / Max [r, SpatialBetaDriverRadius];
+thetaExpr = Min [Exp [1 - r / SpatialShiftGammaCoeffRadius], 1];
+
+
+
+(******************************************************************************)
+(* Groups *)
+(******************************************************************************)
+(* Variable groups created from the tensors above; every group name is prefix <> "...", with prefix = "ML_" *)
+evolvedGroups =
+  {SetGroupName [CreateGroupFromTensor [phi      ], prefix <> "log_confac"],
+   SetGroupName [CreateGroupFromTensor [gt[la,lb]], prefix <> "metric"    ],
+   SetGroupName [CreateGroupFromTensor [Xt[ua]   ], prefix <> "Gamma"     ],
+   SetGroupName [CreateGroupFromTensor [trK      ], prefix <> "trace_curv"],
+   SetGroupName [CreateGroupFromTensor [At[la,lb]], prefix <> "curv"      ],
+   SetGroupName [CreateGroupFromTensor [alpha    ], prefix <> "lapse"     ],
+(*SetGroupName [CreateGroupFromTensor [A        ], prefix <> "dtlapse"   ],*)
+   SetGroupName [CreateGroupFromTensor [Kdot[la, lb]], prefix <> "Kdot"   ],
+   SetGroupName [CreateGroupFromTensor [Xtdot[ua]], prefix <> "Xtdot"   ],
+   SetGroupName [CreateGroupFromTensor [phidot], prefix <> "phidot"   ],
+   SetGroupName [CreateGroupFromTensor [beta[ua] ], prefix <> "shift"     ],
+   SetGroupName [CreateGroupFromTensor [B[ua]    ], prefix <> "dtshift"   ],
+   IfCCZ4[SetGroupName[CreateGroupFromTensor[Theta], prefix <> "Theta"]]};
+evaluatedGroups =
+  {SetGroupName [CreateGroupFromTensor [H      ], prefix <> "Ham"],
+   SetGroupName [CreateGroupFromTensor [M[la]  ], prefix <> "mom"],
+   SetGroupName [CreateGroupFromTensor [term1    ], prefix <> "term1"],
+   SetGroupName [CreateGroupFromTensor [term2    ], prefix <> "term2"],
+   SetGroupName [CreateGroupFromTensor [term3    ], prefix <> "term3"],
+   SetGroupName [CreateGroupFromTensor [cS     ], prefix <> "cons_detg"],
+   SetGroupName [CreateGroupFromTensor [cXt[ua]], prefix <> "cons_Gamma"],
+   SetGroupName [CreateGroupFromTensor [cA     ], prefix <> "cons_traceA"]};
+(* NOTE(review): Kdot, Xtdot and phidot are listed in evolvedGroups, yet evolCalc assigns them directly rather than through dot[...] - confirm they should not be in evaluatedGroups instead *)
+declaredGroups = Join [evolvedGroups, evaluatedGroups];
+declaredGroupNames = Map [First, declaredGroups];
+
+
+(* Groups belonging to other implementations whose variables the calculations use; each entry is {"Implementation::group", {variables}} *)
+extraGroups =
+  {{"Grid::coordinates", {x, y, z, r}},
+   {"ADMBase::metric",  {gxx, gxy, gxz, gyy, gyz, gzz}},
+   {"ADMBase::curv",    {kxx, kxy, kxz, kyy, kyz, kzz}},
+   {"ADMBase::lapse",   {alp}},
+   {"ADMBase::dtlapse", {dtalp}},
+   {"ADMBase::shift",   {betax, betay, betaz}},
+   {"ADMBase::dtshift", {dtbetax, dtbetay, dtbetaz}},
+   {"QuasiMaximalSlicing::W", { W }},
+   {"TmunuBase::stress_energy_scalar", {eTtt}},
+   {"TmunuBase::stress_energy_vector", {eTtx, eTty, eTtz}},
+   {"TmunuBase::stress_energy_tensor", {eTxx, eTxy, eTxz, eTyy, eTyz, eTzz}}
+};
+
+groups = Join [declaredGroups, extraGroups];
+
+
+
+(******************************************************************************)
+(* Initial data *)
+(******************************************************************************)
+(* Flat-space initial data, enabled when my_initial_data is "Minkowski"; alpha is not set here because its assignment is commented out *)
+initialCalc =
+{
+  Name -> thorn <> "_Minkowski",
+  Schedule -> {"IN ADMBase_InitialData"},
+  ConditionalOnKeyword -> {"my_initial_data", "Minkowski"},
+  Equations -> 
+  {
+    phi       -> IfThen[conformalMethod==CMW, 1, 0],
+    gt[la,lb] -> KD[la,lb],
+    trK       -> 0,
+    At[la,lb] -> 0,
+    Xt[ua]    -> 0,
+    (*alpha     -> 1,
+    A         -> 0,*)
+    beta[ua]  -> 0,
+    B[ua]     -> 0,
+    IfCCZ4[Theta -> 0]
+  }
+};
+
+
+
+(******************************************************************************)
+(* Split a calculation *)
+(******************************************************************************)
+(* PartialCalculation[calc, suffix, updates, evolVars]: returns a copy of calc whose Name has suffix appended and whose entries are replaced according to the key -> value rules in updates, keeping only the equations for evolVars (plain or wrapped in dot[...]) and for the shorthands *)
+PartialCalculation[calc_, suffix_, updates_, evolVars_] :=
+Module[
+  {name, calc1, replaces, calc2, vars, patterns, eqs, calc3},
+  (* Add suffix to name *)
+  name     = lookup[calc, Name] <> suffix;
+  calc1    = mapReplace[calc, Name, name];
+  (* Replace some entries in the calculation *)
+  (* replaces = Map[Function[rule, mapReplace[#, rule[[1]], rule[[2]]]&], updates]; *)
+  replaces = updates //. (lhs_ -> rhs_) -> (mapReplace[#, lhs, rhs]&);
+  calc2 = Apply[Composition, replaces][calc1];
+  (* Remove unnecessary equations; NOTE(review): the equations are read from the original calc, so an Equations entry in updates would be discarded - confirm this is intended *)
+  vars     = Join[evolVars, lookup[calc2, Shorthands]];
+  patterns = Replace[vars, {    Tensor[n_,__]  ->     Tensor[n,__] ,
+                            dot[Tensor[n_,__]] -> dot[Tensor[n,__]]}, 1];
+  eqs      = FilterRules[lookup[calc, Equations], patterns];
+  calc3    = mapReplace[calc2, Equations, eqs];
+  calc3
+];
+
+
+
+(******************************************************************************)
+(* Convert from ADMBase *)
+(******************************************************************************)
+(* Sets phi, gt, trK, At, alpha and beta from the ADMBase metric, extrinsic curvature, lapse and shift; enabled when my_initial_data is "ADMBase" *)
+convertFromADMBaseCalc =
+{
+  Name -> thorn <> "_convertFromADMBase",
+  Schedule -> {"AT initial AFTER ADMBase_PostInitial"},
+  ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+  Shorthands -> {g[la,lb], detg, gu[ua,ub], em4phi},
+  Equations -> 
+  {
+    g[la,lb]  -> admg[la,lb],
+    detg      -> detgExpr,
+    gu[ua,ub] -> 1/detg detgExpr MatrixInverse [g[ua,ub]],
+    
+    phi       -> IfThen[conformalMethod==CMW, detg^(-1/6), Log[detg]/12],
+    em4phi    -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+    gt[la,lb] -> em4phi g[la,lb],
+    
+    trK       -> gu[ua,ub] admK[la,lb],
+    At[la,lb] -> em4phi (admK[la,lb] - (1/3) g[la,lb] trK),
+    
+    alpha     -> admalpha,
+    
+    beta[ua]  -> admbeta[ua],
+
+    IfCCZ4[Theta -> 0]
+  }
+};
+(* Computes Xt from the Christoffel symbols of gt, and B from admdtbeta; scheduled after the convertFromADMBase calculation and enabled when my_initial_data is "ADMBase" *)
+convertFromADMBaseGammaCalc =
+{
+  Name -> thorn <> "_convertFromADMBaseGamma",
+  Schedule -> {"AT initial AFTER " <> thorn <> "_convertFromADMBase"},
+  ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+  (*
+  Where -> InteriorNoSync,
+  *)
+  (* NOTE(review): this comment appears to describe the disabled InteriorNoSync variant above:
+     do not synchronise right after this routine; instead, synchronise after extrapolating *)
+  Where -> Interior,
+  (* Active choice (Interior): synchronise after this routine, so that the refinement boundaries
+     are set correctly before extrapolating.  (We will need to
+     synchronise again after extrapolating because extrapolation does
+     not fill ghost zones, but this is irrelevant here.)  *)
+  Shorthands -> {dir[ua],
+                 detgt, gtu[ua,ub], Gt[ua,lb,lc], theta},
+  Equations -> 
+  {
+    dir[ua] -> Sign[beta[ua]],
+    
+    detgt        -> 1 (* detgtExpr *),
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    Gt[ua,lb,lc] -> 1/2 gtu[ua,ud]
+                    (PD[gt[lb,ld],lc] + PD[gt[lc,ld],lb] - PD[gt[lb,lc],ld]),
+    Xt[ua] -> gtu[ub,uc] Gt[ua,lb,lc],
+    
+(*
+    A -> - admdtalpha / (harmonicF alpha^harmonicN) (LapseAdvectionCoeff - 1),
+*)
+    (* If LapseACoeff=0, then A is not evolved, in the sense that it
+       does not influence the time evolution of other variables.  *)
+    (*A -> IfThen [LapseACoeff != 0,
+                 1 / (- harmonicF alpha^harmonicN)
+                 (+ admdtalpha
+                  - LapseAdvectionCoeff Upwind[beta[ua], alpha, la]),
+                 0],*)
+    
+    theta -> thetaExpr,
+    
+    (* If ShiftBCoeff=0 or theta ShiftGammaCoeff=0, then B^i is not
+       evolved, in the sense that it does not influence the time
+       evolution of other variables.  *)
+    B[ua] -> IfThen [ShiftGammaCoeff ShiftBCoeff != 0,
+                     1 / (theta ShiftGammaCoeff)
+                     (+ admdtbeta[ua]
+                      - ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb]),
+                     0]
+  }
+};
+
+(* Initialise the Gamma variables to 0.  This is necessary with
+   multipatch because convertFromADMBaseGamma does not perform the
+   conversion in the boundary points, and the order in which symmetry
+   (interpatch) and outer boundary conditions are applied means that
+   points which are both interpatch and symmetry points are never
+   initialised. *)
+initGammaCalc =
+{
+  Name -> thorn <> "_InitGamma",
+  Schedule -> {"AT initial BEFORE " <> thorn <> "_convertFromADMBaseGamma"},
+  ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+  Where -> Everywhere,
+  Equations -> 
+  {
+    Xt[ua] -> 0,
+    (*A      -> 0,*)
+    B[ua]  -> 0
+  }
+};
+
+
+
+(******************************************************************************)
+(* Convert to ADMBase *)
+(******************************************************************************)
+(* Writes the ADMBase metric, extrinsic curvature, lapse and shift from phi, gt, At, trK, alpha and beta at every grid point *)
+convertToADMBaseCalc =
+{
+  Name -> thorn <> "_convertToADMBase",
+  Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+  Where -> Everywhere,
+  Shorthands -> {e4phi},
+  Equations -> 
+  {
+    e4phi       -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]],
+    admg[la,lb] -> e4phi gt[la,lb],
+    admK[la,lb] -> e4phi At[la,lb] + (1/3) admg[la,lb] trK,
+    admalpha    -> alpha,
+    admbeta[ua] -> beta[ua]
+  }
+};
+(* Sets admdtbeta with Where -> Interior when dt_lapse_shift_method is "correct"; the admdtalpha equations are commented out, so the lapse time derivative is not written here *)
+convertToADMBaseDtLapseShiftCalc =
+{
+  Name -> thorn <> "_convertToADMBaseDtLapseShift",
+  Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+  ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"},
+  Where -> Interior,
+  Shorthands -> {dir[ua], detgt, gtu[ua,ub], eta, theta, em4phi, Ddetgt[la]},
+  Equations -> 
+  {
+    dir[ua] -> Sign[beta[ua]],
+    
+    detgt -> 1 (* detgtExpr *),
+    (* This leads to simpler code... *)
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    em4phi       -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+    
+    eta -> etaExpr,
+    theta -> thetaExpr,
+
+    (* Ddetgt should be zero analytically, but we're not assuming it here. Change commenting to assume it.*)
+    Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la],
+    (*Ddetgt[la] -> 0,*)
+ 
+    (* see RHS *)
+(*
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK)
+                  + LapseAdvectionCoeff beta[ua] PDu[alpha,la],
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    (+ LapseACoeff       A
+                     + ((1 - LapseACoeff)
+                        (trK - IfCCZ4[2 Theta, 0])))
+                  + LapseAdvectionCoeff Upwind[beta[ua], alpha, la],
+*)
+    admdtbeta[ua] -> IfThen[harmonicShift,
+                            - 1/2 gtu[ua,uj] em4phi alpha
+                              (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj]
+                               + 2 PD[alpha,lj]
+                               + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])),
+                            (* else *)
+                            + theta ShiftGammaCoeff
+                              (+ ShiftBCoeff B[ua]
+                               + (1 - ShiftBCoeff)
+                                 (Xt[ua] - eta BetaDriver beta[ua]))]
+                     + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb]
+  }
+};
+(* Sets admdtbeta with Where -> BoundaryWithGhosts when dt_lapse_shift_method is "correct", leaving out all derivative terms: the harmonicShift branch is 0 and no advection term is added *)
+convertToADMBaseDtLapseShiftBoundaryCalc =
+{
+  Name -> thorn <> "_convertToADMBaseDtLapseShiftBoundary",
+  Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+  ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"},
+  Where -> BoundaryWithGhosts,
+  Shorthands -> {detgt, gtu[ua,ub], eta, theta},
+  Equations ->
+  {
+    detgt -> 1 (* detgtExpr *),
+    (* This leads to simpler code... *)
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    
+    eta -> etaExpr,
+    theta -> thetaExpr,
+    
+    (* see RHS, but omit derivatives near the boundary *)
+(*
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK),
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    (+ LapseACoeff       A
+                     + ((1 - LapseACoeff)
+                        (trK - IfCCZ4[2 Theta, 0]))),
+*)
+    admdtbeta[ua] -> IfThen[harmonicShift,
+                            0,
+                            (* else *)
+                            + theta ShiftGammaCoeff
+                              (+ ShiftBCoeff B[ua]
+                               + (1 - ShiftBCoeff)
+                                 (Xt[ua] - eta BetaDriver beta[ua]))]
+  }
+};
+(* Sets admdtbeta at every grid point when dt_lapse_shift_method is "noLapseShiftAdvection", using the derivative-free expression only *)
+convertToADMBaseFakeDtLapseShiftCalc =
+{
+  Name -> thorn <> "_convertToADMBaseFakeDtLapseShift",
+  Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+  ConditionalOnKeyword -> {"dt_lapse_shift_method", "noLapseShiftAdvection"},
+  Where -> Everywhere,
+  Shorthands -> {detgt, gtu[ua,ub], eta, theta},
+  Equations ->
+  {
+    detgt -> 1 (* detgtExpr *),
+    (* This leads to simpler code... *)
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    
+    eta -> etaExpr,
+    theta -> thetaExpr,
+    
+    (* see RHS, but omit derivatives everywhere (which is wrong, but
+       faster, since it does not require synchronisation or boundary
+       conditions) *)
+(*
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK),
+    admdtalpha -> - harmonicF alpha^harmonicN
+                    (+ LapseACoeff       A
+                     + ((1 - LapseACoeff)
+                        (trK - IfCCZ4[2 Theta, 0]))),
+*)
+    admdtbeta[ua] -> IfThen[harmonicShift,
+                            0,
+                            (* else *)
+                            + theta ShiftGammaCoeff
+                              (+ ShiftBCoeff B[ua]
+                               + (1 - ShiftBCoeff)
+                                 (Xt[ua] - eta BetaDriver beta[ua]))]
+  }
+};
+
+(******************************************************************************)
+(* Evolution equations *)
+(******************************************************************************)
+
+evolCalc =
+{
+  Name -> thorn <> "_RHS",
+  Schedule -> {"IN " <> thorn <> "_evolCalcGroup"},
+  (*
+  Where -> Interior,
+  *)
+  (* Synchronise the RHS grid functions after this routine, so that
+     the refinement boundaries are set correctly before applying the
+     radiative boundary conditions.  *)
+  Where -> InteriorNoSync,
+  Shorthands -> {dir[ua],
+                 detgt, gtu[ua,ub],
+                 Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], G[ua, lb, lc], ddetg[la], Xtn[ua],
+                 Rt[la,lb], Rphi[la,lb], R[la,lb],
+                 Atm[ua,lb], Atu[ua,ub],
+                 e4phi, em4phi, cdphi[la], cdphi2[la,lb], g[la,lb], detg,
+                 gu[ua,ub], Ats[la,lb], trAts, eta, theta,
+                 K[la, lb], Km[la, ub],
+                 rho, S[la], trS, fac1, fac2, dottrK, dotXt[ua],
+                 epsdiss[ua], IfCCZ4[Z[ua]], IfCCZ4[dotTheta], Ddetgt[la]},
+  Equations -> 
+  {
+    dir[ua] -> Sign[beta[ua]],
+    
+    detgt -> 1 (* detgtExpr *),
+    
+    (* This leads to simpler code... *)
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    Gtl[la,lb,lc]  -> 1/2
+                      (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]),
+    Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld],
+    Gt[ua,lb,lc]   -> gtu[ua,ud] Gtl[ld,lb,lc],
+ 
+    (* The conformal connection functions calculated from the conformal metric,
+       used instead of Xt where no derivatives of Xt are taken *)
+    Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk],
+
+    e4phi       -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]],
+    em4phi      -> 1 / e4phi,
+    g[la,lb]    -> e4phi gt[la,lb],
+    detg        -> detgExpr,
+    gu[ua,ub]   -> em4phi gtu[ua,ub],
+    ddetg[la]   -> 4 detgt e4phi PD[phi,la],
+    G[ua,lb,lc] -> Gt[ua,lb,lc]
+                   + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb]
+                                 - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]),
+    K[la, lb]   -> e4phi At[la, lb] + (1/3) g[la, lb] trK,
+    Km[la, ub]  -> gu[ub, uc] K[la, lc],
+
+    (* The Z quantities *)
+    (* gr-qc:1106.2254 (2011), eqn. (23) *)
+    IfCCZ4[
+      Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc])
+    ],
+
+    (* PRD 62, 044034 (2000), eqn. (18) *)
+    (* Adding Z term by changing Xtn to Xt *)
+    Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm]
+                 + (1/2) gt[lk,li] PD[Xt[uk],lj]
+                 + (1/2) gt[lk,lj] PD[Xt[uk],li]
+                 + (1/2) Xtn[uk] Gtl[li,lj,lk]
+                 + (1/2) Xtn[uk] Gtl[lj,li,lk]
+                 + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul]
+                    + Gt[uk,lj,ll] Gtlu[li,lk,ul]
+                    + Gt[uk,li,ll] Gtlu[lk,lj,ul]),
+
+    fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1],
+    cdphi[la] -> fac1 CDt[phi,la],
+    fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0],
+    cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb],
+
+    (* PRD 62, 044034 (2000), eqn. (15) *)
+    Rphi[li,lj] -> - 2 cdphi2[lj,li]
+                   - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln]
+                   + 4 cdphi[li] cdphi[lj]
+                   - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll],
+    
+    Atm[ua,lb] -> gtu[ua,uc] At[lc,lb],
+    Atu[ua,ub] -> Atm[ua,lc] gtu[ub,uc],
+    
+    R[la,lb] -> Rt[la,lb] + Rphi[la,lb],
+    IfCCZ4[
+      R[la,lb] -> R[la,lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb]
+        + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc])
+        + e4phi Z[uc] PD[gt[la,lb],lc]
+    ],
+    
+    (* Matter terms *)
+    
+    (* rho = n^a n^b T_ab *)
+    rho -> addMatter
+           (1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj])),
+    
+    (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *)
+    S[li] -> addMatter (-1/alpha (T0[li] - beta[uj] T[li,lj])),
+    
+    (* trS = gamma^ij T_ij  *)
+    trS -> addMatter (em4phi gtu[ui,uj] T[li,lj]),
+    
+    (* RHS terms *)
+    
+    (* PRD 62, 044034 (2000), eqn. (10) *)
+    (* PRD 67 084023 (2003), eqn. (16) and (23) *)
+    dot[phi]       -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6]
+                      (alpha trK - PD[beta[ua],la]),
+    phidot -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6]
+                      (alpha trK - PD[beta[ua],la]),
+    
+    (* PRD 62, 044034 (2000), eqn. (9) *)
+    (* gr-qc:1106.2254 (2011), eqn. (14) *)
+    (* removing trA from Aij ensures that detg = 1 *)
+    dot[gt[la,lb]] -> - 2 alpha (At[la,lb] - IfCCZ4[(1/3) At[lc,ld] gtu[uc,ud] gt[la,lb], 0])
+                      + gt[la,lc] PD[beta[uc],lb] + gt[lb,lc] PD[beta[uc],la]
+                      - (2/3) gt[la,lb] PD[beta[uc],lc],
+    (* PRD 62, 044034 (2000), eqn. (20) *)
+    (* PRD 67 084023 (2003), eqn (26) *)
+    (* gr-qc:1106.2254 (2011), eqn. (19) *)
+    (* Adding Z terms by changing Xtn to Xt,
+       also adding extra Z and Theta terms *)
+    dotXt[ui]      -> - 2 Atu[ui,uj] PD[alpha,lj]
+                      + 2 alpha (+ Gt[ui,lj,lk] Atu[uk,uj]
+                                 - (2/3) gtu[ui,uj] PD[trK,lj]
+                                 + 6 Atu[ui,uj] cdphi[lj])
+                      + gtu[uj,ul] PD[beta[ui],lj,ll]
+                      + (1/3) gtu[ui,uj] PD[beta[ul],lj,ll]
+                      - Xtn[uj] PD[beta[ui],lj] 
+                      + (2/3) Xtn[ui] PD[beta[uj],lj]
+                      + IfCCZ4[
+                               + GammaShift 2 e4phi (- Z[uj] PD[beta[ui],lj]
+                                                 + (2/3) Z[ui] PD[beta[uj],lj])
+                               - (4/3) alpha e4phi Z[ui] trK
+                               + 2 gtu[ui,uj] (+ alpha PD[Theta,lj]
+                                               - Theta PD[alpha,lj])
+                               - 2 alpha e4phi dampk1 Z[ui],
+                        0]
+    (* Equation (4.28) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+                      + addMatter (- 16 Pi alpha gtu[ui,uj] S[lj]),
+    dot[Xt[ui]]    -> dotXt[ui],
+    Xtdot[ui] -> dotXt[ui],
+
+    (* gr-qc:1106.2254 (2011), eqn. (18) *)
+    IfCCZ4[
+      dotTheta ->
+        - PD[alpha,la] Z[ua] - dampk1 (2 + dampk2) alpha Theta
+        + (1/2) alpha (gu[ua,ub] R[la,lb] - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - 2 trK Theta) 
+        + addMatter (- 8 Pi alpha rho)
+    ],
+
+    IfCCZ4[
+      dot[Theta] -> dotTheta
+    ],
+
+    (* PRD 62, 044034 (2000), eqn. (11) *)
+    (* gr-qc:1106.2254 (2011), eqn. (17) *)
+    (* Adding the RHS of Theta to K, because K_Z4 = K_BSSN + 2 Theta *)
+    (* Also adding the Z term, as it has to cancel with the one in Theta *)
+    (*dottrK         -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb]
+                                + 2 cdphi[la] PD[alpha,lb] )
+                                - Xtn[ua] PD[alpha,la] )
+                      + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2)
+                      + IfCCZ4[
+                               + 2 dotTheta + 2 PD[alpha,la] Z[ua]
+                               + dampk1 (1 - dampk2) alpha Theta,
+                               0]*)
+    term1         -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb]
+                                + 2 cdphi[la] PD[alpha,lb] )
+                                - Xtn[ua] PD[alpha,la] ),
+    term2         ->  + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2),
+    term3         ->  IfCCZ4[+ 2 dotTheta + 2 PD[alpha,la] Z[ua]
+                               + dampk1 (1 - dampk2) alpha Theta, 0],
+    dottrK        -> term1 + term2 + term3
+    (* Equation (4.21) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+                      + addMatter (4 Pi alpha (rho + trS)),
+    dot[trK]       -> KEvolFactor dottrK,
+
+    (* PRD 62, 044034 (2000), eqn. (12) *)
+    (* TODO: Should we use the Hamiltonian constraint to make Rij tracefree? *)
+    (* gr-qc:1106.2254 (2011), eqn. (15) *)
+    (* Adding Z terms in the Ricci and Theta terms *)
+    Ats[la,lb]     -> - CDt[alpha,la,lb] +
+                      + 2 (PD[alpha,la] cdphi[lb] + PD[alpha,lb] cdphi[la] )
+                      + alpha R[la,lb],
+    trAts          -> gu[ua,ub] Ats[la,lb],
+    dot[At[la,lb]] -> + em4phi (+ Ats[la,lb] - (1/3) g[la,lb] trAts )
+                      + alpha (+ ((trK - IfCCZ4[2 Theta, 0])
+                                  At[la,lb])
+                               - 2 At[la,lc] Atm[uc,lb])
+                      + At[la,lc] PD[beta[uc],lb] + At[lb,lc] PD[beta[uc],la]
+                      - (2/3) At[la,lb] PD[beta[uc],lc]
+    (* Equation (4.23) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+                      + addMatter (- em4phi alpha 8 Pi
+                                     (T[la,lb] - (1/3) g[la,lb] trS)),
+    Kdot[la, lb] -> -CD[alpha, la, lb] + alpha (R[la, lb] + trK K[la, lb] - 2 K[la, lc] Km[lb, uc]),
+    
+    
+    eta -> etaExpr,
+    theta -> thetaExpr,
+
+    (* Ddetgt should be zero analytically, but we're not assuming it here. Change commenting to assume it.*)
+    Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la],
+    (*Ddetgt[la] -> 0,*)
+
+    (* dot[beta[ua]] -> eta Xt[ua], *)
+    (* dot[beta[ua]] -> ShiftGammaCoeff alpha^ShiftAlphaPower B[ua], *)
+    dot[beta[ua]] -> IfThen[harmonicShift,
+                            - 1/2 gtu[ua,uj] em4phi alpha
+                              (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj]
+                               + 2 PD[alpha,lj]
+                               + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])),
+                            (* else *)
+                            + theta ShiftGammaCoeff
+                              (+ ShiftBCoeff B[ua]
+                               + (1 - ShiftBCoeff)
+                                 (Xt[ua] - eta BetaDriver beta[ua]))],
+
+    dot[B[ua]]    -> + ShiftBCoeff (dotXt[ua] - eta BetaDriver B[ua])
+      (* Note that this dotXt[ua] is not yet \partial_t \tilde \Gamma^i, because the
+         advection term has not yet been added.  It is actually
+         \partial_t \tilde \Gamma^i - \beta^j \partial_j \tilde \Gamma^i *)
+  }
+};
+
+lapseEvolCalc = { (* Kranc calculation that sets the RHS of the lapse, dot[alpha] *)
+  Name -> thorn <> "_lapse_evol",
+  Schedule -> {"IN " <> thorn <> "_evolCalcGroup"},
+  (*
+  Where -> Interior,
+  *)
+  (* Synchronise the RHS grid functions after this routine, so that
+     the refinement boundaries are set correctly before applying the
+     radiative boundary conditions.  *)
+  Where -> InteriorNoSync,
+  Shorthands -> {},
+  Equations -> (* slicing term -harmonicF alpha^harmonicN (trK [- 2 Theta with CCZ4] + AlphaDriver (alpha - 1)) plus the source WFactor qmsw *)
+  { (* NOTE(review): qmsw is not defined in this calculation; presumably a grid function of the inherited QuasiMaximalSlicing implementation - confirm *)
+    dot[alpha] -> - harmonicF alpha^harmonicN (+ trK - IfCCZ4[2 Theta, 0] + AlphaDriver (alpha - 1)) + WFactor qmsw
+  }
+};
+
+advectCalc = (* adds the upwinded shift-advection terms to RHSs that were computed by earlier calculations *)
+{
+  Name -> thorn <> "_Advect",
+  Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <> (* must run after every calculation that writes the RHSs it increments *)
+               "AFTER (" <> thorn <> "_RHS " <> thorn <> "_lapse_evol " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+  (*
+  Where -> Interior,
+  *)
+  (* Synchronise the RHS grid functions after this routine, so that
+     the refinement boundaries are set correctly before applying the
+     radiative boundary conditions.  *)
+  Where -> InteriorNoSync,
+  Shorthands -> {dir[ua]},
+  Equations ->
+  {
+           dir[ua] -> Sign[beta[ua]], (* sign of the shift; presumably read by the Upwind operator to pick the stencil side - confirm in the derivative definitions *)
+
+          dot[phi] -> dot[phi] + Upwind[beta[ua], phi, la],
+
+    dot[gt[la,lb]] -> dot[gt[la,lb]] + Upwind[beta[uc], gt[la,lb], lc],
+
+       dot[Xt[ui]] -> dot[Xt[ui]] + Upwind[beta[uj], Xt[ui], lj],
+
+    IfCCZ4[
+        dot[Theta] -> dot[Theta] + Upwind[beta[ua], Theta, la]
+    ],
+
+          dot[trK] -> dot[trK] + Upwind[beta[ua], trK, la],
+
+    dot[At[la,lb]] -> dot[At[la,lb]] + Upwind[beta[uc], At[la,lb], lc],
+
+        (*
+        dot[alpha] -> dot[alpha]
+                      + LapseAdvectionCoeff Upwind[beta[ua], alpha, la],
+
+            dot[A] -> dot[A]
+                      + LapseACoeff (
+                        + LapseAdvectionCoeff       Upwind[beta[ua], A, la]
+                        + (1 - LapseAdvectionCoeff) Upwind[beta[ua], trK, la]),
+    *)
+
+     dot[beta[ua]] -> dot[beta[ua]] (* shift advection is weighted by ShiftAdvectionCoeff *)
+                      + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb],
+
+        dot[B[ua]] -> dot[B[ua]]
+                      + ShiftBCoeff (
+                        + ShiftAdvectionCoeff Upwind[beta[ub], B[ua], lb]
+                        + ((1 - ShiftAdvectionCoeff)
+                           Upwind[beta[ub], Xt[ua], lb]))
+                      (* Note that the advection term \beta^j \partial_j \tilde \Gamma^i is not
+                         subtracted here when ShiftAdvectionCoeff == 1 because it was
+                         implicitly subtracted before (see the comment in the previous
+                         calculation of dot[B[ua]]). *)
+  }
+};
+
+evolCalc1 = PartialCalculation[evolCalc, "1", (* first half of the split RHS: everything except dot[At] *)
+  {
+    ConditionalOnKeyword -> {"RHS_calculation", "split"} (* NOTE(review): no keyword named RHS_calculation is declared in keywordParameters in this file - confirm before enabling *)
+  },
+  { (* quantities this partial calculation is asked to produce *)
+    dot[phi],
+    dot[gt[la,lb]],
+    dot[Xt[ui]],
+    term1, term2, term3,
+    dot[trK],
+    dot[beta[ua]],
+    dot[B[ua]],
+    IfCCZ4[dot[Theta]]
+  }];
+
+evolCalc2 = PartialCalculation[evolCalc, "2", (* second half of the split RHS: only dot[At] *)
+  {
+    ConditionalOnKeyword -> {"RHS_calculation", "split"} (* NOTE(review): no keyword named RHS_calculation is declared in keywordParameters in this file - confirm before enabling *)
+  },
+  {
+    dot[At[la,lb]]
+  }];
+
+dissCalc = (* single calculation adding the dissipation term to every listed RHS *)
+{
+  Name -> thorn <> "_Dissipation",
+  Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <>
+               "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+  ConditionalOnKeyword -> {"apply_dissipation", "always"}, (* only scheduled when apply_dissipation = "always" *)
+  Where -> InteriorNoSync,
+  Shorthands -> {epsdiss[ua]},
+  Equations ->
+  {
+    epsdiss[ua] -> EpsDiss, (* same strength in every direction *)
+    Sequence@@Table[ (* expands into one equation per variable in the list below *)
+      dot[var]       -> dot[var] + epsdiss[ux] PDdiss[var,lx],
+      {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb],
+       (*alpha, A,*) beta[ua], B[ua]}}]
+  }
+};
+
+dissCalcs = (* alternative to dissCalc: a list with one dissipation calculation per variable *)
+Table[
+{
+  Name -> thorn <> "_Dissipation_" <> ToString[var /. {Tensor[n_,__] -> n}], (* tensors contribute only their head name to the routine name *)
+  Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <>
+               "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+  ConditionalOnKeyword -> {"apply_dissipation", "always"},
+  Where -> InteriorNoSync,
+  Shorthands -> {epsdiss[ua]},
+  Equations ->
+  {
+    epsdiss[ua] -> EpsDiss,
+    dot[var]    -> dot[var] + epsdiss[ux] PDdiss[var,lx]
+  }
+},
+  {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb],
+    (*alpha, A,*) beta[ua], B[ua]}}
+];
+
+RHSStaticBoundaryCalc = (* "static" RHS boundary condition: all listed RHSs are set to zero on the boundary *)
+{
+  Name -> thorn <> "_RHSStaticBoundary",
+  Schedule -> {"IN MoL_CalcRHS"},
+  ConditionalOnKeyword -> {"my_rhs_boundary_condition", "static"},
+  Where -> Boundary,
+  Equations -> 
+  {
+    dot[phi]       -> 0,
+    dot[gt[la,lb]] -> 0,
+    dot[trK]       -> 0,
+    dot[At[la,lb]] -> 0,
+    dot[Xt[ua]]    -> 0,
+    (*dot[alpha]     -> 0,
+    dot[A]         -> 0,*)
+    dot[beta[ua]]  -> 0,
+    dot[B[ua]]     -> 0,
+    IfCCZ4[dot[Theta] -> 0] (* Theta is only present in the CCZ4 variant *)
+  }
+};
+
+(* Initialise the RHS variables in analysis in case they are going to
+   be output - the noninterior points cannot be filled, so we define
+   them to be zero *)
+initRHSCalc = (* zeroes the listed RHS grid functions on the whole grid at analysis time *)
+{
+  Name -> thorn <> "_InitRHS",
+  Schedule -> {"AT analysis BEFORE " <> thorn <> "_evolCalcGroup"},
+  Where -> Everywhere,
+  Equations -> 
+  {
+    dot[phi]       -> 0,
+    dot[gt[la,lb]] -> 0,
+    dot[trK]       -> 0,
+    dot[At[la,lb]] -> 0,
+    dot[Xt[ua]]    -> 0,
+    (*dot[alpha]     -> 0,
+    dot[A]         -> 0,*)
+    dot[beta[ua]]  -> 0,
+    dot[B[ua]]     -> 0,
+    IfCCZ4[dot[Theta] -> 0]
+  }
+};
+
+RHSRadiativeBoundaryCalc = (* "radiative" RHS boundary condition; note that the calculations list in this file has this entry commented out *)
+{
+  Name -> thorn <> "_RHSRadiativeBoundary",
+  Schedule -> {"IN MoL_CalcRHS"},
+  ConditionalOnKeyword -> {"my_rhs_boundary_condition", "radiative"},
+  Where -> Boundary,
+  Shorthands -> {dir[ua],
+                 detgt, gtu[ua,ub], em4phi, gu[ua,ub],
+                 nn[la], nu[ua], nlen, nlen2, su[ua],
+                 vg},
+  Equations -> 
+  {
+    dir[ua] -> Sign[normal[ua]],
+    
+    detgt      -> 1 (* detgtExpr *),
+    gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    em4phi     -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+    gu[ua,ub]  -> em4phi gtu[ua,ub],
+    
+    nn[la] -> Euc[la,lb] normal[ub], (* covariant normal built from the boundary normal *)
+    nu[ua] -> gu[ua,ub] nn[lb], (* index raised with the physical inverse metric *)
+    nlen2  -> nu[ua] nn[la],
+    nlen   -> Sqrt[nlen2],
+    su[ua] -> nu[ua] / nlen, (* normal rescaled by its metric length *)
+    
+    vg -> Sqrt[harmonicF], (* speed factor applied to phi, trK and Theta below *)
+    
+    dot[phi]       -> - vg su[uc] PDo[phi      ,lc],
+    dot[gt[la,lb]] -> -    su[uc] PDo[gt[la,lb],lc],
+    dot[trK]       -> - vg su[uc] PDo[trK      ,lc],
+    dot[At[la,lb]] -> -    su[uc] PDo[At[la,lb],lc],
+    dot[Xt[ua]]    -> -    su[uc] PDo[Xt[ua]   ,lc],
+    (*dot[alpha]     -> - vg su[uc] PDo[alpha    ,lc],
+    dot[A]         -> - vg su[uc] PDo[A        ,lc],*)
+    dot[beta[ua]]  -> -    su[uc] PDo[beta[ua] ,lc],
+    dot[B[ua]]     -> -    su[uc] PDo[B[ua]    ,lc],
+    IfCCZ4[
+      dot[Theta]   -> - vg su[uc] PDo[Theta    ,lc]
+    ]
+  }
+};
+
+enforceCalc = (* removes the trace of At with respect to the conformal metric after each MoL step *)
+{
+  Name -> thorn <> "_enforce",
+  Schedule -> {"IN MoL_PostStepModify"},
+  Shorthands -> {detgt, gtu[ua,ub], trAt},
+  Equations -> 
+  {
+    (* The following comment is still interesting, but is not correct
+       any more since it is now scheduled in MoL_PostStepModify instead:
+
+       Enforcing the constraints needs to be a projection, because it
+       is applied in MoL_PostStep and may thus be applied multiple
+       times, not only during time evolution. Therefore detgt has to
+       be calculated correctly, without assuming that det gt_ij = 1,
+       which is not always the case (since we don't enforce it). On
+       the other hand, this may not be so important... *)
+    detgt -> 1 (* detgtExpr *),
+    gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    
+    trAt -> gtu[ua,ub] At[la,lb],
+    
+    At[la,lb] -> At[la,lb] - (1/3) gt[la,lb] trAt(*,
+    
+    alpha -> Max[alpha, MinimumLapse]*)
+  }
+};
+
+(******************************************************************************)
+(* Boundary conditions *)
+(******************************************************************************)
+
+boundaryCalc = (* "Minkowski" boundary condition: overwrites boundary and ghost points with fixed flat-space values *)
+{
+  Name -> thorn <> "_boundary",
+  Schedule -> {"IN MoL_PostStep"},
+  ConditionalOnKeyword -> {"my_boundary_condition", "Minkowski"},
+  Where -> BoundaryWithGhosts,
+  Equations -> 
+  {
+    phi       -> IfThen[conformalMethod==CMW, 1, 0], (* 1 for the W method, 0 for the phi method *)
+    gt[la,lb] -> KD[la,lb],
+    trK       -> 0,
+    At[la,lb] -> 0,
+    Xt[ua]    -> 0,
+    (*alpha     -> 1,
+    A         -> 0,*)
+    beta[ua]  -> 0,
+    B[ua]     -> 0,
+    IfCCZ4[Theta -> 0]
+  }
+};
+
+(******************************************************************************)
+(* Constraint equations *)
+(******************************************************************************)
+
+constraintsCalc = (* evaluates the constraint quantities H, M[li], cS, cXt[ua] and cA at interior points *)
+{
+  Name -> thorn <> "_constraints",
+  Schedule -> Automatic,
+  After -> "MoL_PostStep",
+  Where -> Interior,
+  Shorthands -> {detgt, ddetgt[la], gtu[ua,ub], Z[ua],
+                 Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], Xtn[ua],
+                 e4phi, em4phi,
+                 g[la,lb], detg, gu[ua,ub], ddetg[la], G[ua,lb,lc],
+                 Rt[la,lb], Rphi[la,lb], R[la,lb], trR, Atm[ua,lb],
+                 gK[la,lb,lc], cdphi[la], cdphi2[la,lb],
+                 rho, S[la], fac1, fac2},
+  Equations -> 
+  {
+    detgt        -> 1 (* detgtExpr *),
+    ddetgt[la]   -> 0 (* ddetgtExpr[la] *),
+    
+    (* This leads to simpler code... *)
+    gtu[ua,ub]   -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+    Gtl[la,lb,lc]  -> 1/2
+                      (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]),
+    Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld],
+    Gt[ua,lb,lc]   -> gtu[ua,ud] Gtl[ld,lb,lc],
+    
+    (* The conformal connection functions calculated from the conformal metric,
+       used instead of Xt where no derivatives of Xt are taken *)
+    Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk],
+
+    e4phi       -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]],
+    em4phi      -> 1 / e4phi,
+    g[la,lb]    -> e4phi gt[la,lb],
+    detg        -> e4phi^3, (* determinant of e4phi gt with detgt taken as 1, as set above *)
+    gu[ua,ub]   -> em4phi gtu[ua,ub],
+
+    (* The Z quantities *)
+    IfCCZ4[
+      Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc])
+    ],
+    
+    (* PRD 62, 044034 (2000), eqn. (18) *)
+    Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm]
+                 + (1/2) gt[lk,li] PD[Xt[uk],lj]
+                 + (1/2) gt[lk,lj] PD[Xt[uk],li]
+                 + (1/2) Xtn[uk] Gtl[li,lj,lk]
+                 + (1/2) Xtn[uk] Gtl[lj,li,lk]
+                 + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul]
+                    + Gt[uk,lj,ll] Gtlu[li,lk,ul]
+                    + Gt[uk,li,ll] Gtlu[lk,lj,ul]),
+
+    (* From the long turducken paper.
+       This expression seems to give the same result as the one from 044034.  *)
+    (* TODO: symmetrise correctly: (ij) = (1/2) [i+j] *)
+(*
+    Rt[li,lj] -> - (1/2) gtu[uk,ul] PD[gt[li,lj],lk,ll]
+                 + gt[lk,li] PD[Xt[uk],lj] + gt[lk,lj] PD[Xt[uk],li]
+                 + gt[li,ln] Gt[un,lj,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm]
+                 + gt[lj,ln] Gt[un,li,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm]
+                 + gtu[ul,us] (+ 2 Gt[uk,ll,li] gt[lj,ln] Gt[un,lk,ls]
+                               + 2 Gt[uk,ll,lj] gt[li,ln] Gt[un,lk,ls]
+                               + Gt[uk,li,ls] gt[lk,ln] Gt[un,ll,lj]),
+*)
+
+    (* Below would be a straightforward calculation,
+       without taking any Gamma^i into account.
+       This expression gives a different answer!  *)
+(*
+    Rt[la,lb] -> + Gt[u1,l2,la] Gt[l1,lb,u2] - Gt[u1,la,lb] Gt[l1,l2,u2]
+                 + 1/2 gtu[u1,u2] (- PD[gt[l1,l2],la,lb] + PD[gt[l1,la],l2,lb]
+                                   - PD[gt[la,lb],l1,l2] + PD[gt[l2,lb],l1,la]),
+*)
+
+    fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1], (* factors converting derivatives of the evolved phi for the W method; 1 and 0 for the phi method *)
+    cdphi[la] -> fac1 CDt[phi,la],
+    fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0],
+    cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb],
+
+    (* PRD 62, 044034 (2000), eqn. (15) *)
+    Rphi[li,lj] -> - 2 cdphi2[lj,li]
+                   - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln]
+                   + 4 cdphi[li] cdphi[lj]
+                   - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll],
+    
+    (* ddetg[la] -> PD[e4phi detg,la], *)
+    ddetg[la]   -> e4phi ddetgt[la] + 4 detgt e4phi PD[phi,la],
+    (* TODO: check this equation, maybe simplify it by omitting ddetg *)
+    G[ua,lb,lc] -> Gt[ua,lb,lc]
+                   + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb]
+                                 - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]),
+    
+    R[la,lb] -> + Rt[la,lb] + Rphi[la,lb], (* total Ricci tensor: conformal part plus conformal-factor part *)
+
+    IfCCZ4[
+      R[la,lb] -> R[la, lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb]
+        + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc])
+        + e4phi Z[uc] PD[gt[la,lb],lc]
+    ],
+
+    trR      -> gu[ua,ub] R[la,lb],
+    
+    (* K[la,lb] -> e4phi At[la,lb] + (1/3) g[la,lb] trK, *)
+    (* Km[ua,lb] -> gu[ua,uc] K[lc,lb], *)
+    Atm[ua,lb] -> gtu[ua,uc] At[lc,lb],
+    
+    (* Matter terms *)
+    
+    (* rho = n^a n^b T_ab *)
+    rho -> 1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj]),
+    
+    (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *)
+    S[li] -> -1/alpha (T0[li] - beta[uj] T[li,lj]),
+    
+    (* Constraints *)
+    
+    (* H -> trR - Km[ua,lb] Km[ub,la] + trK^2, *)
+    (* PRD 67, 084023 (2003), eqn. (19) *)
+    H -> trR - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - addMatter 16 Pi rho,
+    
+    (* gK[la,lb,lc] -> CD[K[la,lb],lc], *)
+(*    gK[la,lb,lc] -> + 4 e4phi PD[phi,lc] At[la,lb] + e4phi CD[At[la,lb],lc]
+                    + (1/3) g[la,lb] PD[trK,lc],
+
+    M[la] -> gu[ub,uc] (gK[lc,la,lb] - gK[lc,lb,la]), *)
+
+    M[li] -> + gtu[uj,uk] (CDt[At[li,lj],lk] + 6 At[li,lj] cdphi[lk])
+             - (2/3) PD[trK,li]
+             - addMatter 8 Pi S[li],
+    (* TODO: use PRD 67, 084023 (2003), eqn. (20) *)
+    
+    (* det gamma-tilde *)
+    cS -> Log[detgt], (* detgt is hard-wired to 1 above, so this evaluates to Log[1] *)
+    
+    (* Gamma constraint *)
+    cXt[ua] -> gtu[ub,uc] Gt[ua,lb,lc] - Xt[ua],
+    
+    (* trace A-tilde *)
+    cA -> gtu[ua,ub] At[la,lb]
+  }
+};
+
+constraintsCalc1 = PartialCalculation[constraintsCalc, "1", (* split variant producing only H; commented out in the calculations list *)
+  {},
+  {
+    H
+  }];
+
+constraintsCalc2 = PartialCalculation[constraintsCalc, "2", (* split variant producing the remaining constraint quantities; commented out in the calculations list *)
+  {},
+  {
+    M[li],
+    cS,
+    cXt[ua],
+    cA
+  }];
+
+(******************************************************************************)
+(* Implementations *)
+(******************************************************************************)
+
+inheritedImplementations = (* implementations the generated thorn inherits from *)
+  Join[{"ADMBase", "QuasiMaximalSlicing"},
+       If [addMatter!=0, {"TmunuBase"}, {}]]; (* TmunuBase is only inherited when matter terms are enabled *)
+
+(******************************************************************************)
+(* Parameters *)
+(******************************************************************************)
+
+inheritedKeywordParameters = {}; (* no keyword parameters are inherited *)
+
+extendedKeywordParameters = (* each listed ADMBase keyword gains the thorn name as an additional allowed value *)
+{
+  {
+    Name -> "ADMBase::evolution_method",
+    AllowedValues -> {thorn}
+  },
+  {
+    Name -> "ADMBase::lapse_evolution_method",
+    AllowedValues -> {thorn}
+  },
+  {
+    Name -> "ADMBase::shift_evolution_method",
+    AllowedValues -> {thorn}
+  },
+  {
+    Name -> "ADMBase::dtlapse_evolution_method",
+    AllowedValues -> {thorn}
+  },
+  {
+    Name -> "ADMBase::dtshift_evolution_method",
+    AllowedValues -> {thorn}
+  }
+};
+
+keywordParameters = (* keyword parameters declared by the generated thorn itself *)
+{
+  {
+    Name -> "my_initial_data",
+    (* Visibility -> "restricted", *)
+    (* Description -> "ddd", *)
+    AllowedValues -> {"ADMBase", "Minkowski"},
+    Default -> "ADMBase"
+  },
+  {
+    Name -> "my_initial_boundary_condition",
+    Visibility -> "restricted",
+    (* Description -> "ddd", *)
+    AllowedValues -> {"none"},
+    Default -> "none"
+  },
+  {
+    Name -> "my_rhs_boundary_condition", (* selects RHSStaticBoundaryCalc ("static") or RHSRadiativeBoundaryCalc ("radiative") *)
+    Visibility -> "restricted",
+    (* Description -> "ddd", *)
+    AllowedValues -> {"none", "static", "radiative"},
+    Default -> "none"
+  },
+  {
+    Name -> "my_boundary_condition", (* "Minkowski" enables boundaryCalc *)
+    (* Visibility -> "restricted", *)
+    (* Description -> "ddd", *)
+    AllowedValues -> {"none", "Minkowski"},
+    Default -> "none"
+  },
+  {
+    Name -> "calculate_ADMBase_variables_at",
+    Visibility -> "restricted",
+    (* Description -> "ddd", *)
+    AllowedValues -> {"MoL_PostStep", "CCTK_EVOL", "CCTK_ANALYSIS"},
+    Default -> "MoL_PostStep"
+  },
+  {
+    Name -> "UseSpatialBetaDriver",
+    Visibility -> "restricted",
+    (* Description -> "ddd", *)
+    AllowedValues -> {"no", "yes"},
+    Default -> "no"
+  },
+  {
+    Name -> "dt_lapse_shift_method",
+    Description -> "Treatment of ADMBase dtlapse and dtshift",
+    AllowedValues -> {"correct",
+                      "noLapseShiftAdvection" (* omit lapse and shift advection terms (faster) *)
+                     },
+    Default -> "correct"
+  },
+  {
+    Name -> "apply_dissipation", (* "always" enables dissCalc / dissCalcs *)
+    Description -> "Whether to apply dissipation to the RHSs",
+    AllowedValues -> {"always",
+                      "never" (* yes and no keyword values confuse Cactus, and Kranc
+                                 doesn't support boolean parameters *)
+                     },
+    Default -> "never"
+  }
+
+};
+
+intParameters = (* integer parameters declared by the generated thorn *)
+{
+  {
+    Name -> harmonicN, (* exponent of alpha in the lapse RHS *)
+    Description -> "d/dt alpha = - f alpha^n K  (harmonic=2, 1+log=1)",
+    Default -> 2
+  },
+  {
+    Name -> ShiftAlphaPower,
+    Default -> 0
+  },
+  {
+    Name -> conformalMethod, (* compared against CMW in the calculations to choose between the phi and W forms *)
+    Description -> "Treatment of conformal factor",
+    AllowedValues -> {{Value -> "0", Description -> "phi method"},
+                      {Value -> "1", Description -> "W method"}},
+    Default -> 0
+  },
+  {
+    Name -> fdOrder,
+    Default -> derivOrder, (* defaults to the derivative order the code is generated for *)
+    AllowedValues -> {2,4,6,8}
+  },
+  {
+    Name -> harmonicShift,
+    Description -> "Whether to use the harmonic shift",
+    AllowedValues -> {{Value -> "0", Description -> "Gamma driver shift"},
+                      {Value -> "1", Description -> "Harmonic shift"}},
+    Default -> 0
+  }
+};
+
+realParameters = (* real-valued parameters declared by the generated thorn *)
+{
+  IfCCZ4[{
+    Name -> GammaShift,
+    Description -> "Covariant shift term in Gamma",
+    Default -> 0.5
+  }],   
+  IfCCZ4[{
+    Name -> dampk1,
+    Description -> "CCZ4 damping term 1 for Theta and Z",
+    Default -> 0
+  }],
+  IfCCZ4[{
+    Name -> dampk2,
+    Description -> "CCZ4 damping term 2 for Theta and Z",
+    Default -> 0
+  }],
+  {
+    Name -> LapseACoeff,
+    Description -> "Whether to evolve A in time",
+    Default -> 0
+  },
+  {
+    Name -> harmonicF, (* prefactor f of the lapse RHS *)
+    Description -> "d/dt alpha = - f alpha^n K   (harmonic=1, 1+log=2)",
+    Default -> 1
+  },
+  {
+    Name -> AlphaDriver,
+    Default -> 0
+  },
+  {
+    Name -> RDriver,
+    Default -> 1
+  },
+  {
+    Name -> ShiftBCoeff,
+    Description -> "Whether to evolve B^i in time",
+    Default -> 1
+  },
+  {
+    Name -> ShiftGammaCoeff,
+    Default -> 0
+  },
+  {
+    Name -> BetaDriver,
+    Default -> 0
+  },
+  {
+    Name -> WFactor, (* multiplies qmsw in the lapse RHS *)
+    Default -> 1
+  },
+  {
+    Name -> LapseAdvectionCoeff,
+    Description -> "Factor in front of the lapse advection terms in 1+log",
+    Default -> 1
+  },
+  {
+    Name -> ShiftAdvectionCoeff,
+    Description -> "Factor in front of the shift advection terms in gamma driver",
+    Default -> 1
+  },
+  {
+    Name -> MinimumLapse,
+    Description -> "Minimum value of the lapse function",
+    Default -> -1
+  },
+  {
+    Name -> SpatialBetaDriverRadius,
+    Description -> "Radius at which the BetaDriver starts to be reduced",
+    AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}},
+    Default -> 10^12
+  },
+  {
+    Name -> SpatialShiftGammaCoeffRadius,
+    Description -> "Radius at which the ShiftGammaCoefficient starts to be reduced",
+    AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}},
+    Default -> 10^12
+  },
+  {
+    Name -> EpsDiss,
+    Description -> "Dissipation strength",
+    AllowedValues -> {{Value -> "0:*", Description -> "Non-negative"}}, (* closed at 0: the default must lie inside the allowed range *)
+    Default -> 0
+  },
+  {
+    Name -> KEvolFactor, (* dot[trK] is KEvolFactor times dottrK in the evolution equations *)
+    Description -> "Factor multiplying the RHS of trK",
+    AllowedValues -> {{Value -> "0:*", Description -> "Non-negative"}}, (* closed at 0: the default must lie inside the allowed range *)
+    Default -> 0
+  }
+};
+
+(******************************************************************************)
+(* Construct the thorns *)
+(******************************************************************************)
+
+calculations = (* the calculations actually handed to Kranc; commented-out names are defined but not generated *)
+Join[
+{
+  initialCalc,
+  convertFromADMBaseCalc,
+  initGammaCalc,
+  convertFromADMBaseGammaCalc,
+   evolCalc, (* unsplit RHS calculation; the split pair on the next line is disabled *)
+  (*evolCalc1, evolCalc2,*)
+  lapseEvolCalc,
+  dissCalc,
+  advectCalc,
+  initRHSCalc,
+  (* evol1Calc, evol2Calc, *)
+  RHSStaticBoundaryCalc,
+  (* RHSRadiativeBoundaryCalc, *)
+  enforceCalc,
+  boundaryCalc,
+  convertToADMBaseCalc,
+  convertToADMBaseDtLapseShiftCalc,
+  convertToADMBaseDtLapseShiftBoundaryCalc,
+  convertToADMBaseFakeDtLapseShiftCalc,
+  constraintsCalc
+  (*constraintsCalc1, constraintsCalc2*)
+},
+  {} (*dissCalcs*)
+];
+
+CreateKrancThornTT [groups, ".", thorn, (* generate the thorn named by thorn into the current directory *)
+  Calculations -> calculations,
+  DeclaredGroups -> declaredGroupNames,
+  PartialDerivatives -> derivatives,
+  EvolutionTimelevels -> evolutionTimelevels,
+  DefaultEvolutionTimelevels -> 3,
+  UseJacobian -> True, (* NOTE(review): fixed to True here although the createCode argument list documented below includes a useJacobian flag - confirm intent *)
+  UseLoopControl -> True,
+  UseVectors -> True,
+  InheritedImplementations -> inheritedImplementations,
+  InheritedKeywordParameters -> inheritedKeywordParameters,
+  ExtendedKeywordParameters -> extendedKeywordParameters,
+  KeywordParameters -> keywordParameters,
+  IntParameters -> intParameters,
+  RealParameters -> realParameters
+];
+
+];
+
+
+
+(******************************************************************************)
+(* Options *)
+(******************************************************************************)
+
+(* These are the arguments to createCode:
+   - derivative order: 2, 4, 6, 8, ...
+   - useJacobian: False or True
+   - split upwind derivatives: False or True
+   - timelevels: 2 or 3
+     (keep this at 3; this is better chosen with a run-time parameter)
+   - matter: 0 or 1
+     (matter seems cheap; it should be always enabled)
+   - thorn base name
+*)
+
+createCode[4, False, True , 3, 1, "MD"]; (* order 4, no Jacobian, split upwind derivatives, 3 timelevels, matter on, thorn base name "MD" *)
diff --git a/src/basis.c b/src/basis.c
new file mode 100644
index 0000000..8e5bdcc
--- /dev/null
+++ b/src/basis.c
@@ -0,0 +1,281 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <math.h>
+
+#include "basis.h"
+#include "common.h"
+
+typedef struct BasisSet { /* table of callbacks implementing one basis family */
+    /* evaluate the idx-th basis function at the specified point */
+    double (*eval)      (const MDBasisSetContext *s, double coord, unsigned int idx);
+    /* evaluate the first derivative of the idx-th basis function at the specified point */
+    double (*eval_diff1)(const MDBasisSetContext *s, double coord, unsigned int idx);
+    /* evaluate the second derivative of the idx-th basis function at the specified point */
+    double (*eval_diff2)(const MDBasisSetContext *s, double coord, unsigned int idx);
+    /**
+     * Get the idx-th collocation point for the specified order.
+     * idx runs from 0 to order - 1 (inclusive)
+     */
+    double (*colloc_point)(const MDBasisSetContext *s, unsigned int order, unsigned int idx);
+} BasisSet;
+
+struct MDBasisSetContext { /* state created by md_basis_init() and released by md_basis_free() */
+    const BasisSet *bs; /* callbacks of the selected family */
+    double sf;          /* scale factor passed to md_basis_init(); read by the SB and TB callbacks */
+};
+
+/*
+ * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
+ * SB(x, n) = sin((n + 1) arccot(|x| / L))
+ * They are symmetric wrt origin and decay as 1/x at infinity.
+ */
+static double sb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* sin(m t) with t = atan2(sf, x) and m = 2 * idx + 1 */
+    const unsigned int mult  = 2 * idx + 1;
+    const double       theta = atan2(s->sf, coord);
+
+    return sin(mult * theta);
+}
+
+static double sb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* d/dx sin(m t) = m cos(m t) dt/dx, with dt/dx = -sf / (sf^2 + x^2) and m = 2 * idx + 1 */
+    const unsigned int mult  = 2 * idx + 1;
+    const double       theta = atan2(s->sf, coord);
+
+    return -s->sf * mult * cos(mult * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+static double sb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* second derivative of sin(m t), t = atan2(sf, x), m = 2 * idx + 1 */
+    const unsigned int mult  = 2 * idx + 1;
+    const double       scale = s->sf;
+    const double       arg   = mult * atan2(scale, coord);
+    const double       num   = 2 * coord * cos(arg) - scale * mult * sin(arg);
+    return scale * mult * num / SQR(SQR(scale) + SQR(coord));
+}
+
+static double sb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    /*
+     * x = sf / tan(t) with t = (rev + 2) * pi / den, where rev is the
+     * reversed point index and den depends on MD_POLAR.
+     */
+    const unsigned int rev = order - idx - 1;
+#if MD_POLAR
+    const unsigned int den = 2 * order + 3;
+#else
+    const unsigned int den = 2 * order + 2;
+#endif
+
+    return s->sf / tan((rev + 2) * M_PI / den);
+}
+
+static const BasisSet sb_even_basis = { /* callbacks for MD_BASIS_FAMILY_SB_EVEN */
+    .colloc_point = sb_even_colloc_point,
+    .eval_diff2   = sb_even_eval_diff2,
+    .eval_diff1   = sb_even_eval_diff1,
+    .eval         = sb_even_eval,
+};
+
+static double sb_odd_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* sin(m t) with t = atan2(sf, x) and m = 2 * idx + 2 */
+    const unsigned int mult  = 2 * idx + 2;
+    const double       theta = atan2(s->sf, coord);
+
+    return sin(mult * theta);
+}
+
+static double sb_odd_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* d/dx sin(m t) = m cos(m t) dt/dx, with dt/dx = -sf / (sf^2 + x^2) and m = 2 * idx + 2 */
+    const unsigned int mult  = 2 * idx + 2;
+    const double       theta = atan2(s->sf, coord);
+
+    return -s->sf * mult * cos(mult * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+static double sb_odd_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* second derivative of sin(m t), t = atan2(sf, x), m = 2 * idx + 2 */
+    const unsigned int mult  = 2 * idx + 2;
+    const double       scale = s->sf;
+    const double       arg   = mult * atan2(scale, coord);
+    const double       num   = 2 * coord * cos(arg) - scale * mult * sin(arg);
+    return scale * mult * num / SQR(SQR(scale) + SQR(coord));
+}
+
+static double sb_odd_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    /*
+     * x = sf / tan(t) with t = (rev + 2) * pi / den, rev = reversed index.
+     * NOTE(review): both MD_POLAR branches use the same den - confirm intent.
+     */
+    const unsigned int rev = order - idx - 1;
+#if MD_POLAR
+    const unsigned int den = 2 * order + 3;
+#else
+    const unsigned int den = 2 * order + 3;
+#endif
+
+    return s->sf / tan((rev + 2) * M_PI / den);
+}
+
+static const BasisSet sb_odd_basis = { /* callbacks for MD_BASIS_FAMILY_SB_ODD */
+    .colloc_point = sb_odd_colloc_point,
+    .eval_diff2   = sb_odd_eval_diff2,
+    .eval_diff1   = sb_odd_eval_diff1,
+    .eval         = sb_odd_eval,
+};
+
+static double tb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* cos(n t) - 1 with t = atan(sf / |x|) and n = 2 * (idx + 1); t is pi/2 at x = 0 */
+    const unsigned int mult = 2 * (idx + 1);
+    double theta = M_PI_2;
+    if (coord != 0.0)
+        theta = atan(s->sf / fabs(coord));
+    return cos(mult * theta) - 1.0;
+}
+
+static double tb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* d/dx [cos(n t) - 1], t = atan(sf / |x|), n = 2 * (idx + 1); the SGN factor comes from d|x|/dx */
+    const unsigned int mult = 2 * (idx + 1);
+    double theta = M_PI_2;
+    if (coord != 0.0)
+        theta = atan(s->sf / fabs(coord));
+    return s->sf * mult * SGN(coord) * sin(mult * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+static double tb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    /* second derivative of cos(n t) - 1, t = atan(sf / |x|), n = 2 * (idx + 1) */
+    const double sf = s->sf;
+    double val = (coord == 0.0) ? M_PI_2 : atan(sf / fabs(coord));
+    idx++;
+    idx *= 2;   // even only
+    /* no SGN(coord) here: sgn(x)^2 = 1 and x sgn(x) = |x|, so the result is even in x like the function itself */
+    return -sf * idx * (2 * fabs(coord) * sin(idx * val) + sf * idx * cos(idx * val)) / SQR(SQR(sf) + SQR(coord));
+}
+
+static double tb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    /*
+     * x = sf / tan(t), where t = (rev + 2) * pi / (2 * order + 4) and
+     * rev = order - idx - 1 is the reversed point index.
+     */
+    const unsigned int rev = order - idx - 1;
+    const unsigned int den = 2 * order + 4;
+
+    return s->sf / tan((rev + 2) * M_PI / den);
+}
+
+static const BasisSet tb_even_basis = { /* callbacks for MD_BASIS_FAMILY_TB_EVEN */
+    .colloc_point = tb_even_colloc_point,
+    .eval_diff2   = tb_even_eval_diff2,
+    .eval_diff1   = tb_even_eval_diff1,
+    .eval         = tb_even_eval,
+};
+
+static double cos_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    return cos(coord * (2 * idx)); /* cos(2 idx x); the context s is not read */
+}
+
+static double cos_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    return -2.0 * idx * sin(2 * idx * coord); /* -2 idx sin(2 idx x); 2.0 keeps the product in double, -2 * idx would wrap as unsigned */
+}
+
+static double cos_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    return -4.0 * SQR((double)idx) * cos(2 * idx * coord); /* -4 idx^2 cos(2 idx x); computed in double, -4 * SQR(idx) would wrap as unsigned */
+}
+
+static double cos_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    return idx * M_PI / (2 * order); /* idx * pi / (2 * order); the context s is not read */
+}
+
+static const BasisSet cos_even_basis = { /* callbacks for MD_BASIS_FAMILY_COS_EVEN */
+    .colloc_point = cos_even_colloc_point,
+    .eval_diff2   = cos_even_eval_diff2,
+    .eval_diff1   = cos_even_eval_diff1,
+    .eval         = cos_even_eval,
+};
+
+double md_basis_eval(const MDBasisSetContext *s, enum MDBasisEvalType type,
+                     double coord, unsigned int order)
+{
+    /* order is handed to the callback as the basis-function index; an unknown type yields NaN instead of a NULL call */
+    switch (type) {
+    case MD_BASIS_EVAL_TYPE_VALUE: return s->bs->eval(s, coord, order);
+    case MD_BASIS_EVAL_TYPE_DIFF1: return s->bs->eval_diff1(s, coord, order);
+    case MD_BASIS_EVAL_TYPE_DIFF2: return s->bs->eval_diff2(s, coord, order);
+    default:                       break;
+    }
+
+    return NAN;
+}
+
+double md_basis_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    /* forward to the collocation-point callback of the family stored in the context */
+    return s->bs->colloc_point(s, order, idx);
+}
+
+void md_basis_free(MDBasisSetContext **pctx)
+{
+    /* Free the context *pctx points to and reset it to NULL;
+     * nothing happens when *pctx is already NULL. */
+
+    if (*pctx) {
+        free(*pctx);
+        *pctx = NULL;
+    }
+}
+
+int md_basis_init(MDBasisSetContext **pctx, enum MDBasisFamily family, double sf)
+{
+    /*
+     * Allocate a context for the requested family with scale factor sf.
+     * Returns 0 and stores the context in *pctx on success; on failure
+     * returns -ENOMEM or -EINVAL and leaves *pctx untouched.
+     */
+    MDBasisSetContext *c = calloc(1, sizeof(*c));
+    if (!c)
+        return -ENOMEM;
+
+    if      (family == MD_BASIS_FAMILY_TB_EVEN)  c->bs = &tb_even_basis;
+    else if (family == MD_BASIS_FAMILY_SB_EVEN)  c->bs = &sb_even_basis;
+    else if (family == MD_BASIS_FAMILY_SB_ODD)   c->bs = &sb_odd_basis;
+    else if (family == MD_BASIS_FAMILY_COS_EVEN) c->bs = &cos_even_basis;
+    else {
+        free(c);
+        return -EINVAL;
+    }
+    c->sf = sf;
+    *pctx = c;
+    return 0;
+}
diff --git a/src/basis.h b/src/basis.h
new file mode 100644
index 0000000..08f23ee
--- /dev/null
+++ b/src/basis.h
@@ -0,0 +1,45 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_BASIS_H
+#define MD_BASIS_H
+
+/*
+ * Selects which per-family callback md_basis_eval() dispatches to.
+ */
+enum MDBasisEvalType {
+    MD_BASIS_EVAL_TYPE_VALUE, /* the basis function itself (eval callback) */
+    MD_BASIS_EVAL_TYPE_DIFF1, /* its first derivative (eval_diff1 callback) */
+    MD_BASIS_EVAL_TYPE_DIFF2, /* its second derivative (eval_diff2 callback) */
+};
+
+/*
+ * Basis families accepted by md_basis_init(); each one binds the context to
+ * a different callback table in basis.c.
+ * NOTE(review): TB/SB presumably denote the rational Chebyshev families
+ * of the same names -- confirm against the definitions in basis.c.
+ */
+enum MDBasisFamily {
+    MD_BASIS_FAMILY_TB_EVEN,  /* binds tb_even_basis */
+    MD_BASIS_FAMILY_SB_EVEN,  /* binds sb_even_basis */
+    MD_BASIS_FAMILY_SB_ODD,   /* binds sb_odd_basis */
+    MD_BASIS_FAMILY_COS_EVEN, /* binds cos_even_basis */
+};
+
+/* opaque basis set state; the definition is private to basis.c */
+typedef struct MDBasisSetContext MDBasisSetContext;
+
+/*
+ * Allocate a context for the given basis family and store the scale factor
+ * sf in it. On success *ctx receives the new context and 0 is returned;
+ * otherwise -ENOMEM (allocation failure) or -EINVAL (unknown family) is
+ * returned.
+ */
+int md_basis_init(MDBasisSetContext **ctx, enum MDBasisFamily family, double sf);
+/*
+ * Free a context created by md_basis_init() and set *ctx to NULL.
+ * Does nothing if *ctx is NULL.
+ */
+void md_basis_free(MDBasisSetContext **ctx);
+
+/*
+ * Evaluate the basis function selected by order, or the derivative chosen
+ * by type, at the coordinate coord.
+ */
+double md_basis_eval(const MDBasisSetContext *ctx, enum MDBasisEvalType type,
+                     double coord, unsigned int order);
+/*
+ * Return collocation point number idx for an expansion of the given order.
+ */
+double md_basis_colloc_point(const MDBasisSetContext *ctx, unsigned int order,
+                             unsigned int idx);
+
+#endif /* MD_BASIS_H */
diff --git a/src/bicgstab.c b/src/bicgstab.c
new file mode 100644
index 0000000..7e82183
--- /dev/null
+++ b/src/bicgstab.c
@@ -0,0 +1,410 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include <cblas.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bicgstab.h"
+
+/* upper bound on the number of BiCGStab iterations attempted per solve */
+#define BICGSTAB_MAXITER 16
+/* convergence threshold for the residual norm (solve_sw() measures it
+ * relative to the norm of the right-hand side) */
+#define BICGSTAB_TOL (1e-15)
+
+/*
+ * Solver state. The host arrays are allocated only when no OpenCL context is
+ * supplied to md_bicgstab_context_alloc(); otherwise the cl_* buffers are
+ * allocated instead.
+ */
+struct BiCGStabContext {
+    int N;                      /* dimension of the N x N system */
+
+    double *x;                  /* current solution estimate; seeded by md_bicgstab_init() */
+    double *p, *v, *y, *z, *t;  /* work vectors of the iteration, N elements each */
+    double *res, *res0;         /* current residual and a copy of the initial residual */
+    double *k;                  /* N x N preconditioner matrix copied in md_bicgstab_init() */
+
+#if HAVE_OPENCL
+    cl_context       ocl_ctx;   /* non-NULL selects the OpenCL code path */
+    cl_command_queue ocl_queue; /* queue on which all device work is enqueued */
+
+    /* device-side counterparts of the host arrays above */
+    cl_mem cl_x;
+    cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+    cl_mem cl_res, cl_res0;
+    cl_mem cl_k, cl_mat;
+    /* device scalars receiving dot products; cl_omega holds two doubles.
+     * cl_beta and cl_omega1 are allocated but not referenced by solve_cl() */
+    cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+    /* scratch buffers handed to the clBLAS reductions */
+    cl_mem cl_tmp, cl_tmp1;
+#endif
+};
+
+#if HAVE_OPENCL
+/*
+ * Preconditioned BiCGStab iteration running on the OpenCL device through
+ * clBLAS. Mirrors solve_sw(): ctx->cl_x holds the initial guess on entry and
+ * is updated in place, ctx->cl_k is the preconditioner matrix.
+ *
+ * mat - N x N system matrix on the host, column-major
+ * rhs - right-hand side on the host, N elements
+ * x   - receives the solution on success
+ *
+ * Returns the zero-based index of the iteration at which the relative
+ * residual dropped below BICGSTAB_TOL, or -1 if that did not happen within
+ * BICGSTAB_MAXITER iterations (x is not written in that case).
+ *
+ * NOTE(review): the return codes of the OpenCL/clBLAS calls are not checked
+ * and the cl_event objects are never released.
+ */
+static int solve_cl(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    cl_command_queue ocl_q = ctx->ocl_queue;
+    const int            N = ctx->N;
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    // upload the matrix and RHS
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double),     rhs, 0, NULL, &events[0]);
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    // (clBLAS has its own order/transpose enums, whose values differ from
+    // the CBLAS ones)
+    clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &ocl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            // p = res + beta * (p - omega * v)
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        // y = k * p
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        // v = mat * y
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        alpha = rho / alpha;
+
+        // res -= alpha * v
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        // z = k * res, t = mat * z
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &ocl_q, 1, &events[1], &events[0]);
+
+        // omega[0] = t . res, omega[1] = t . t, both read back in one go
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        // x += alpha * y + omega * z
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        // res -= omega * t
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &ocl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        // use the same relative stopping criterion as solve_sw()
+        err /= rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
+                        x, 0, NULL, NULL);
+    return i;
+}
+#endif
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+/*
+ * Preconditioned BiCGStab iteration on the host using CBLAS.
+ *
+ * ctx->x holds the initial guess on entry and is updated in place on every
+ * iteration; ctx->k is the preconditioner matrix applied before each
+ * multiplication by mat.
+ *
+ * mat - N x N system matrix, column-major
+ * rhs - right-hand side, N elements
+ * x   - receives a copy of ctx->x on success
+ *
+ * Returns the zero-based index of the iteration at which the residual norm,
+ * relative to the norm of rhs, dropped below BICGSTAB_TOL (0 for an all-zero
+ * rhs), or -1 if that did not happen within BICGSTAB_MAXITER iterations. In
+ * the latter case x is not written, but ctx->x has still been modified.
+ */
+static int solve_sw(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    const int N = ctx->N;
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega = 1.0;
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    double *k = ctx->k;
+    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+    double *res = ctx->res, *res0 = ctx->res0;
+
+    // A zero right-hand side is solved by the zero vector. Handle it here,
+    // since the relative error below would divide by rhs_norm == 0, never
+    // pass the tolerance test and leave non-finite values in ctx->x.
+    if (rhs_norm == 0.0) {
+        memset(ctx->x, 0, N * sizeof(*ctx->x));
+        memset(x,      0, N * sizeof(*x));
+        return 0;
+    }
+
+    // initialize the residual
+    memcpy(res, rhs, N * sizeof(*res));
+    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                mat, N, ctx->x, 1, 1.0, res, 1);
+
+    memcpy(res0, res, N * sizeof(*res0));
+    memcpy(p,    res, N * sizeof(*p));
+
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        rho = cblas_ddot(N, res, 1, res0, 1);
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega);
+
+            // p = res + beta * (p - omega * v)
+            cblas_daxpy(N, -omega, v, 1, p, 1);
+            cblas_dscal(N, beta, p, 1);
+            cblas_daxpy(N, 1, res, 1, p, 1);
+        }
+
+        // y = k * p
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, p, 1, 0.0, y, 1);
+
+        // v = mat * y
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, y, 1, 0.0, v, 1);
+
+        alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+        // res -= alpha * v
+        cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+        // z = k * res, t = mat * z
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, res, 1, 0.0, z, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, z, 1, 0.0, t, 1);
+
+        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+        // x += alpha * y + omega * z
+        cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
+        cblas_daxpy(N, omega, z, 1, ctx->x, 1);
+
+        // res -= omega * t
+        cblas_daxpy(N, -omega, t, 1, res, 1);
+
+        err = cblas_dnrm2(N, res, 1) / rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+
+    return i;
+}
+
+/*
+ * Solve mat * x = rhs with BiCGStab, on the OpenCL device when the context
+ * was allocated with one and on the host otherwise.
+ *
+ * Returns the non-negative value reported by the backend on success and a
+ * negative value on failure (-1 when the iteration did not converge).
+ *
+ * When MD_VERIFY is enabled, the residual rhs - mat * x is recomputed on the
+ * host and the process is aborted if its largest component exceeds 1e-11 in
+ * magnitude; -ENOMEM is returned if the scratch vector for that check cannot
+ * be allocated.
+ */
+int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
+{
+    int ret;
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx)
+        ret = solve_cl(ctx, mat, rhs, x);
+    else
+#endif
+        ret = solve_sw(ctx, mat, rhs, x);
+    if (ret < 0)
+        return ret;
+
+#if MD_VERIFY
+    {
+        const int N = ctx->N;
+        double max_res;
+        double *y;
+
+        y = malloc(sizeof(*y) * N);
+        if (!y)
+            return -ENOMEM;
+
+        // y = rhs - mat * x
+        memcpy(y, rhs, sizeof(*y) * N);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                    mat, N, x, 1, 1.0, y, 1);
+
+        // fetch the component of largest magnitude, then release the
+        // scratch vector so it is not leaked on every call
+        max_res = y[cblas_idamax(N, y, 1)];
+        free(y);
+
+        // magnitude test written without fabs(), since <math.h> is not
+        // included in this file
+        if (max_res > 1e-11 || max_res < -1e-11)
+            abort();
+    }
+#endif
+
+    return ret;
+}
+
+/*
+ * Load the preconditioner matrix k (N x N) and the initial guess x0
+ * (N elements) into the solver, on the device when the context uses OpenCL
+ * and into the host arrays otherwise. Always returns 0.
+ */
+int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
+{
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx) {
+        cl_event uploads[2];
+
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double),
+                             k, 0, NULL, &uploads[0]);
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double),
+                             x0, 0, NULL, &uploads[1]);
+        /* both writes are non-blocking; wait until the data has been consumed */
+        clWaitForEvents(2, uploads);
+
+        return 0;
+    }
+#endif
+
+    memcpy(ctx->k, k,  ctx->N * ctx->N * sizeof(*k));
+    memcpy(ctx->x, x0, ctx->N * sizeof(*x0));
+
+    return 0;
+}
+
+/*
+ * Create a solver for an N x N system.
+ *
+ * With a non-NULL ocl_ctx (and OpenCL support compiled in) all buffers are
+ * created on the device; otherwise 32-byte aligned host arrays are
+ * allocated.
+ *
+ * Returns 0 and stores the context in *pctx on success. On any allocation
+ * failure everything is released again and -ENOMEM is returned.
+ */
+int md_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
+                              cl_context ocl_ctx, cl_command_queue ocl_q)
+{
+    BiCGStabContext *ctx = calloc(1, sizeof(*ctx));
+    int ret = 0;
+
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->N = N;
+
+#if HAVE_OPENCL
+    if (ocl_ctx) {
+        const size_t vec_size = N * sizeof(double);
+        const size_t mat_size = N * N * sizeof(double);
+
+        ctx->ocl_ctx   = ocl_ctx;
+        ctx->ocl_queue = ocl_q;
+
+#define ALLOC(dst, size)                                        \
+do {                                                            \
+    ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret);    \
+    if (ret != CL_SUCCESS)                                      \
+        goto fail;                                              \
+} while (0)
+
+        /* vectors */
+        ALLOC(cl_x,    vec_size);
+        ALLOC(cl_p,    vec_size);
+        ALLOC(cl_v,    vec_size);
+        ALLOC(cl_y,    vec_size);
+        ALLOC(cl_z,    vec_size);
+        ALLOC(cl_t,    vec_size);
+        ALLOC(cl_res,  vec_size);
+        ALLOC(cl_res0, vec_size);
+        ALLOC(cl_tmp,  vec_size);
+        ALLOC(cl_tmp1, N * 2 * sizeof(double));
+
+        /* matrices */
+        ALLOC(cl_k,   mat_size);
+        ALLOC(cl_mat, mat_size);
+
+        /* scalars */
+        ALLOC(cl_rho,    sizeof(double));
+        ALLOC(cl_alpha,  sizeof(double));
+        ALLOC(cl_beta,   sizeof(double));
+        ALLOC(cl_omega,  2 * sizeof(double));
+        ALLOC(cl_omega1, sizeof(double));
+    } else
+#endif
+    {
+        /* every N-element host vector, in allocation order */
+        double **vectors[] = {
+            &ctx->x, &ctx->p, &ctx->v, &ctx->y, &ctx->z, &ctx->t,
+            &ctx->res, &ctx->res0,
+        };
+        size_t i;
+
+        /* attempt all allocations, accumulating any error code */
+        for (i = 0; i < sizeof(vectors) / sizeof(*vectors); i++)
+            ret |= posix_memalign((void**)vectors[i], 32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N);
+    }
+
+fail:
+    if (ret) {
+        md_bicgstab_context_free(&ctx);
+        return -ENOMEM;
+    }
+
+    *pctx = ctx;
+    return 0;
+}
+
+/*
+ * Release every host array and device buffer owned by the solver, free the
+ * context itself and reset *pctx to NULL. Does nothing if *pctx is NULL.
+ */
+void md_bicgstab_context_free(BiCGStabContext **pctx)
+{
+    BiCGStabContext *s = *pctx;
+
+    if (s) {
+#if HAVE_OPENCL
+        if (s->ocl_ctx) {
+            /* vectors */
+            clReleaseMemObject(s->cl_x);
+            clReleaseMemObject(s->cl_p);
+            clReleaseMemObject(s->cl_v);
+            clReleaseMemObject(s->cl_y);
+            clReleaseMemObject(s->cl_z);
+            clReleaseMemObject(s->cl_t);
+            clReleaseMemObject(s->cl_res);
+            clReleaseMemObject(s->cl_res0);
+            clReleaseMemObject(s->cl_tmp);
+            clReleaseMemObject(s->cl_tmp1);
+
+            /* matrices */
+            clReleaseMemObject(s->cl_k);
+            clReleaseMemObject(s->cl_mat);
+
+            /* scalars */
+            clReleaseMemObject(s->cl_rho);
+            clReleaseMemObject(s->cl_alpha);
+            clReleaseMemObject(s->cl_beta);
+            clReleaseMemObject(s->cl_omega);
+            clReleaseMemObject(s->cl_omega1);
+        }
+#endif
+
+        /* host arrays; these are NULL when the OpenCL path was used */
+        free(s->x);
+        free(s->p);
+        free(s->v);
+        free(s->y);
+        free(s->z);
+        free(s->t);
+        free(s->res);
+        free(s->res0);
+        free(s->k);
+
+        free(s);
+        *pctx = NULL;
+    }
+}
diff --git a/src/bicgstab.h b/src/bicgstab.h
new file mode 100644
index 0000000..70624f4
--- /dev/null
+++ b/src/bicgstab.h
@@ -0,0 +1,60 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_BICGSTAB_H
+#define MD_BICGSTAB_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+/* stand-in handle types so that the prototypes below still compile when
+ * OpenCL support is disabled */
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+/* opaque solver state; the definition is private to bicgstab.c */
+typedef struct BiCGStabContext BiCGStabContext;
+
+/**
+ * Allocate and initialize the solver for the NxN system.
+ *
+ * If the OpenCL context and command queue are provided (non-NULL), the solver
+ * will run using clBLAS.
+ *
+ * @param ctx on success, the newly allocated solver is written here
+ * @return 0 on success, -ENOMEM if any allocation fails
+ */
+int md_bicgstab_context_alloc(BiCGStabContext **ctx, int N,
+                              cl_context ocl_ctx, cl_command_queue ocl_q);
+
+/**
+ * Free the solver and all its internal state.
+ *
+ * *ctx is set to NULL afterwards; calling this with *ctx == NULL is a no-op.
+ */
+void md_bicgstab_context_free(BiCGStabContext **ctx);
+
+/**
+ * Initialise the solver with the given preconditioner matrix. This function
+ * may be called any number of times on a given solver context.
+ *
+ * @param k  NxN preconditioner matrix; the data is copied into the solver
+ * @param x0 initial guess for the solution (N elements); also copied
+ * @return 0
+ */
+int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0);
+
+/**
+ * Solve the linear system
+ * mat · x = rhs
+ * The result is written into x.
+ *
+ * @param mat NxN system matrix, stored in column-major order
+ * @param rhs right-hand side, N elements
+ * @param x   receives the solution (N elements); written only on success
+ * @return a non-negative value on success (the zero-based index of the
+ *         iteration at which the solver converged), a negative value on
+ *         failure (-1 if the iteration did not converge)
+ */
+int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x);
+
+#endif /* MD_BICGSTAB_H */
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..2b1ebf6
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,29 @@
+#ifndef MD_COMMON_H
+#define MD_COMMON_H
+
+/* compile-time switches: set to 1 to enable the corresponding code paths */
+#define HAVE_OPENCL 0
+#define MD_VERIFY  0
+#define MD_POLAR   0
+
+/*
+ * Small arithmetic helpers. These are plain macros, so arguments that
+ * appear more than once in an expansion are evaluated more than once --
+ * do not pass expressions with side effects.
+ */
+#define SQR(x) ((x) * (x))
+/* sign of x as +-1.0; zero counts as positive */
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+/* number of elements of a true array (not valid for pointers) */
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+/* current wall-clock time from gettimeofday(), in microseconds */
+static inline int64_t gettime(void)
+{
+    struct timeval now;
+    int64_t usec;
+
+    gettimeofday(&now, NULL);
+
+    usec  = (int64_t)now.tv_sec * 1000000;
+    usec += now.tv_usec;
+
+    return usec;
+}
+
+#endif /* MD_COMMON_H */
diff --git a/src/config.asm b/src/config.asm
new file mode 100644
index 0000000..0ee0ca2
--- /dev/null
+++ b/src/config.asm
@@ -0,0 +1,1325 @@
+%define ARCH_AARCH64 0
+%define ARCH_ALPHA 0
+%define ARCH_ARM 0
+%define ARCH_AVR32 0
+%define ARCH_AVR32_AP 0
+%define ARCH_AVR32_UC 0
+%define ARCH_BFIN 0
+%define ARCH_IA64 0
+%define ARCH_M68K 0
+%define ARCH_MIPS 0
+%define ARCH_MIPS64 0
+%define ARCH_PARISC 0
+%define ARCH_PPC 0
+%define ARCH_PPC64 0
+%define ARCH_S390 0
+%define ARCH_SH4 0
+%define ARCH_SPARC 0
+%define ARCH_SPARC64 0
+%define ARCH_TILEGX 0
+%define ARCH_TILEPRO 0
+%define ARCH_TOMI 0
+%define ARCH_X86 1
+%define ARCH_X86_32 0
+%define ARCH_X86_64 1
+%define HAVE_ARMV5TE 0
+%define HAVE_ARMV6 0
+%define HAVE_ARMV6T2 0
+%define HAVE_ARMV8 0
+%define HAVE_NEON 0
+%define HAVE_VFP 0
+%define HAVE_VFPV3 0
+%define HAVE_ALTIVEC 0
+%define HAVE_DCBZL 1
+%define HAVE_LDBRX 1
+%define HAVE_PPC4XX 0
+%define HAVE_AMD3DNOW 1
+%define HAVE_AMD3DNOWEXT 1
+%define HAVE_AVX 1
+%define HAVE_AVX2 1
+%define HAVE_FMA3 1
+%define HAVE_FMA4 1
+%define HAVE_MMX 1
+%define HAVE_MMXEXT 1
+%define HAVE_SSE 1
+%define HAVE_SSE2 1
+%define HAVE_SSE3 1
+%define HAVE_SSE4 1
+%define HAVE_SSE42 1
+%define HAVE_SSSE3 1
+%define HAVE_XOP 1
+%define HAVE_CPUNOP 1
+%define HAVE_I686 1
+%define HAVE_LOONGSON 1
+%define HAVE_VIS 1
+%define HAVE_ARMV5TE_EXTERNAL 0
+%define HAVE_ARMV6_EXTERNAL 0
+%define HAVE_ARMV6T2_EXTERNAL 0
+%define HAVE_ARMV8_EXTERNAL 0
+%define HAVE_NEON_EXTERNAL 0
+%define HAVE_VFP_EXTERNAL 0
+%define HAVE_VFPV3_EXTERNAL 0
+%define HAVE_ALTIVEC_EXTERNAL 0
+%define HAVE_DCBZL_EXTERNAL 0
+%define HAVE_LDBRX_EXTERNAL 0
+%define HAVE_PPC4XX_EXTERNAL 0
+%define HAVE_AMD3DNOW_EXTERNAL 1
+%define HAVE_AMD3DNOWEXT_EXTERNAL 1
+%define HAVE_AVX_EXTERNAL 1
+%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_FMA3_EXTERNAL 1
+%define HAVE_FMA4_EXTERNAL 1
+%define HAVE_MMX_EXTERNAL 1
+%define HAVE_MMXEXT_EXTERNAL 1
+%define HAVE_SSE_EXTERNAL 1
+%define HAVE_SSE2_EXTERNAL 1
+%define HAVE_SSE3_EXTERNAL 1
+%define HAVE_SSE4_EXTERNAL 1
+%define HAVE_SSE42_EXTERNAL 1
+%define HAVE_SSSE3_EXTERNAL 1
+%define HAVE_XOP_EXTERNAL 1
+%define HAVE_CPUNOP_EXTERNAL 0
+%define HAVE_I686_EXTERNAL 0
+%define HAVE_LOONGSON_EXTERNAL 0
+%define HAVE_VIS_EXTERNAL 0
+%define HAVE_ARMV5TE_INLINE 0
+%define HAVE_ARMV6_INLINE 0
+%define HAVE_ARMV6T2_INLINE 0
+%define HAVE_ARMV8_INLINE 0
+%define HAVE_NEON_INLINE 0
+%define HAVE_VFP_INLINE 0
+%define HAVE_VFPV3_INLINE 0
+%define HAVE_ALTIVEC_INLINE 0
+%define HAVE_DCBZL_INLINE 0
+%define HAVE_LDBRX_INLINE 0
+%define HAVE_PPC4XX_INLINE 0
+%define HAVE_AMD3DNOW_INLINE 1
+%define HAVE_AMD3DNOWEXT_INLINE 1
+%define HAVE_AVX_INLINE 1
+%define HAVE_AVX2_INLINE 1
+%define HAVE_FMA3_INLINE 1
+%define HAVE_FMA4_INLINE 1
+%define HAVE_MMX_INLINE 1
+%define HAVE_MMXEXT_INLINE 1
+%define HAVE_SSE_INLINE 1
+%define HAVE_SSE2_INLINE 1
+%define HAVE_SSE3_INLINE 1
+%define HAVE_SSE4_INLINE 1
+%define HAVE_SSE42_INLINE 1
+%define HAVE_SSSE3_INLINE 1
+%define HAVE_XOP_INLINE 1
+%define HAVE_CPUNOP_INLINE 0
+%define HAVE_I686_INLINE 0
+%define HAVE_LOONGSON_INLINE 0
+%define HAVE_VIS_INLINE 0
+%define HAVE_ALIGNED_STACK 1
+%define HAVE_FAST_64BIT 1
+%define HAVE_FAST_CLZ 1
+%define HAVE_FAST_CMOV 1
+%define HAVE_LOCAL_ALIGNED_8 1
+%define HAVE_LOCAL_ALIGNED_16 1
+%define HAVE_SIMD_ALIGN_16 1
+%define HAVE_ATOMICS_GCC 1
+%define HAVE_ATOMICS_SUNCC 0
+%define HAVE_ATOMICS_WIN32 0
+%define HAVE_ATOMIC_CAS_PTR 0
+%define HAVE_MACHINE_RW_BARRIER 0
+%define HAVE_MEMORYBARRIER 0
+%define HAVE_MM_EMPTY 1
+%define HAVE_RDTSC 0
+%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
+%define HAVE_INLINE_ASM 1
+%define HAVE_SYMVER 1
+%define HAVE_YASM 1
+%define HAVE_BIGENDIAN 0
+%define HAVE_FAST_UNALIGNED 1
+%define HAVE_ALSA_ASOUNDLIB_H 1
+%define HAVE_ALTIVEC_H 0
+%define HAVE_ARPA_INET_H 1
+%define HAVE_CDIO_PARANOIA_H 0
+%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
+%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
+%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
+%define HAVE_DEV_IC_BT8XX_H 0
+%define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0
+%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
+%define HAVE_DIRECT_H 0
+%define HAVE_DLFCN_H 1
+%define HAVE_DXVA_H 0
+%define HAVE_GSM_H 0
+%define HAVE_IO_H 0
+%define HAVE_MACH_MACH_TIME_H 0
+%define HAVE_MACHINE_IOCTL_BT848_H 0
+%define HAVE_MACHINE_IOCTL_METEOR_H 0
+%define HAVE_MALLOC_H 1
+%define HAVE_POLL_H 1
+%define HAVE_SNDIO_H 0
+%define HAVE_SOUNDCARD_H 0
+%define HAVE_SYS_MMAN_H 1
+%define HAVE_SYS_PARAM_H 1
+%define HAVE_SYS_RESOURCE_H 1
+%define HAVE_SYS_SELECT_H 1
+%define HAVE_SYS_SOUNDCARD_H 1
+%define HAVE_SYS_TIME_H 1
+%define HAVE_SYS_UN_H 1
+%define HAVE_SYS_VIDEOIO_H 0
+%define HAVE_UNISTD_H 1
+%define HAVE_WINDOWS_H 0
+%define HAVE_WINSOCK2_H 0
+%define HAVE_INTRINSICS_NEON 0
+%define HAVE_ATANF 1
+%define HAVE_ATAN2F 1
+%define HAVE_CBRTF 1
+%define HAVE_COSF 1
+%define HAVE_EXP2 1
+%define HAVE_EXP2F 1
+%define HAVE_EXPF 1
+%define HAVE_ISINF 1
+%define HAVE_ISNAN 1
+%define HAVE_LDEXPF 1
+%define HAVE_LLRINT 1
+%define HAVE_LLRINTF 1
+%define HAVE_LOG2 1
+%define HAVE_LOG2F 1
+%define HAVE_LOG10F 1
+%define HAVE_LRINT 1
+%define HAVE_LRINTF 1
+%define HAVE_POWF 1
+%define HAVE_RINT 1
+%define HAVE_ROUND 1
+%define HAVE_ROUNDF 1
+%define HAVE_SINF 1
+%define HAVE_TRUNC 1
+%define HAVE_TRUNCF 1
+%define HAVE_ALIGNED_MALLOC 0
+%define HAVE_CLOSESOCKET 0
+%define HAVE_COMMANDLINETOARGVW 0
+%define HAVE_COTASKMEMFREE 0
+%define HAVE_CRYPTGENRANDOM 0
+%define HAVE_DLOPEN 1
+%define HAVE_FCNTL 1
+%define HAVE_FLT_LIM 1
+%define HAVE_FORK 1
+%define HAVE_GETADDRINFO 1
+%define HAVE_GETHRTIME 0
+%define HAVE_GETOPT 1
+%define HAVE_GETPROCESSAFFINITYMASK 0
+%define HAVE_GETPROCESSMEMORYINFO 0
+%define HAVE_GETPROCESSTIMES 0
+%define HAVE_GETRUSAGE 1
+%define HAVE_GETSERVBYPORT 1
+%define HAVE_GETSYSTEMTIMEASFILETIME 0
+%define HAVE_GETTIMEOFDAY 1
+%define HAVE_INET_ATON 1
+%define HAVE_ISATTY 1
+%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
+%define HAVE_LOCALTIME_R 1
+%define HAVE_MACH_ABSOLUTE_TIME 0
+%define HAVE_MAPVIEWOFFILE 0
+%define HAVE_MEMALIGN 1
+%define HAVE_MKSTEMP 1
+%define HAVE_MMAP 1
+%define HAVE_MPROTECT 1
+%define HAVE_NANOSLEEP 1
+%define HAVE_POSIX_MEMALIGN 1
+%define HAVE_SCHED_GETAFFINITY 1
+%define HAVE_SETCONSOLETEXTATTRIBUTE 0
+%define HAVE_SETMODE 0
+%define HAVE_SETRLIMIT 1
+%define HAVE_SLEEP 0
+%define HAVE_STRERROR_R 1
+%define HAVE_STRPTIME 1
+%define HAVE_SYSCONF 1
+%define HAVE_SYSCTL 1
+%define HAVE_USLEEP 1
+%define HAVE_VIRTUALALLOC 0
+%define HAVE_PTHREADS 1
+%define HAVE_W32THREADS 0
+%define HAVE_AS_DN_DIRECTIVE 0
+%define HAVE_AS_FUNC 1
+%define HAVE_ASM_MOD_Q 0
+%define HAVE_ATTRIBUTE_MAY_ALIAS 1
+%define HAVE_ATTRIBUTE_PACKED 1
+%define HAVE_EBP_AVAILABLE 0
+%define HAVE_EBX_AVAILABLE 1
+%define HAVE_GNU_AS 1
+%define HAVE_IBM_ASM 0
+%define HAVE_INLINE_ASM_LABELS 1
+%define HAVE_PRAGMA_DEPRECATED 1
+%define HAVE_SYMVER_ASM_LABEL 0
+%define HAVE_SYMVER_GNU_ASM 1
+%define HAVE_VFP_ARGS 0
+%define HAVE_XFORM_ASM 0
+%define HAVE_XMM_CLOBBERS 1
+%define HAVE_SOCKLEN_T 1
+%define HAVE_STRUCT_ADDRINFO 1
+%define HAVE_STRUCT_GROUP_SOURCE_REQ 1
+%define HAVE_STRUCT_IP_MREQ_SOURCE 1
+%define HAVE_STRUCT_IPV6_MREQ 1
+%define HAVE_STRUCT_POLLFD 1
+%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1
+%define HAVE_STRUCT_SOCKADDR_IN6 1
+%define HAVE_STRUCT_SOCKADDR_SA_LEN 0
+%define HAVE_STRUCT_SOCKADDR_STORAGE 1
+%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1
+%define HAVE_ATOMICS_NATIVE 1
+%define HAVE_DOS_PATHS 0
+%define HAVE_DXVA2_LIB 0
+%define HAVE_LIBC_MSVCRT 0
+%define HAVE_LIBDC1394_1 0
+%define HAVE_LIBDC1394_2 0
+%define HAVE_SDL 0
+%define HAVE_THREADS 1
+%define HAVE_VDPAU_X11 0
+%define HAVE_XLIB 1
+%define CONFIG_BSFS 1
+%define CONFIG_DECODERS 1
+%define CONFIG_DEMUXERS 1
+%define CONFIG_ENCODERS 1
+%define CONFIG_FILTERS 1
+%define CONFIG_HWACCELS 0
+%define CONFIG_INDEVS 1
+%define CONFIG_MUXERS 1
+%define CONFIG_OUTDEVS 1
+%define CONFIG_PARSERS 1
+%define CONFIG_PROTOCOLS 1
+%define CONFIG_AVCODEC_EXAMPLE 1
+%define CONFIG_FILTER_AUDIO_EXAMPLE 1
+%define CONFIG_METADATA_EXAMPLE 1
+%define CONFIG_OUTPUT_EXAMPLE 1
+%define CONFIG_TRANSCODE_AAC_EXAMPLE 1
+%define CONFIG_AVISYNTH 0
+%define CONFIG_BZLIB 1
+%define CONFIG_FREI0R 0
+%define CONFIG_GNUTLS 0
+%define CONFIG_LIBBS2B 0
+%define CONFIG_LIBCDIO 0
+%define CONFIG_LIBDC1394 0
+%define CONFIG_LIBFAAC 0
+%define CONFIG_LIBFDK_AAC 0
+%define CONFIG_LIBFONTCONFIG 0
+%define CONFIG_LIBFREETYPE 0
+%define CONFIG_LIBGSM 0
+%define CONFIG_LIBILBC 0
+%define CONFIG_LIBMP3LAME 0
+%define CONFIG_LIBOPENCORE_AMRNB 0
+%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBOPENCV 0
+%define CONFIG_LIBOPENJPEG 0
+%define CONFIG_LIBOPUS 0
+%define CONFIG_LIBPULSE 0
+%define CONFIG_LIBRTMP 0
+%define CONFIG_LIBSCHROEDINGER 0
+%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBTHEORA 0
+%define CONFIG_LIBTWOLAME 0
+%define CONFIG_LIBVO_AACENC 0
+%define CONFIG_LIBVO_AMRWBENC 0
+%define CONFIG_LIBVORBIS 0
+%define CONFIG_LIBVPX 0
+%define CONFIG_LIBWAVPACK 0
+%define CONFIG_LIBWEBP 0
+%define CONFIG_LIBX264 0
+%define CONFIG_LIBX265 0
+%define CONFIG_LIBXAVS 0
+%define CONFIG_LIBXVID 0
+%define CONFIG_OPENSSL 0
+%define CONFIG_X11GRAB 0
+%define CONFIG_ZLIB 1
+%define CONFIG_GRAY 0
+%define CONFIG_HARDCODED_TABLES 0
+%define CONFIG_RUNTIME_CPUDETECT 0
+%define CONFIG_SAFE_BITSTREAM_READER 1
+%define CONFIG_SHARED 0
+%define CONFIG_SMALL 0
+%define CONFIG_SRAM 0
+%define CONFIG_STATIC 1
+%define CONFIG_SWSCALE_ALPHA 1
+%define CONFIG_DXVA2 0
+%define CONFIG_VAAPI 0
+%define CONFIG_VDA 0
+%define CONFIG_VDPAU 0
+%define CONFIG_GPL 0
+%define CONFIG_NONFREE 0
+%define CONFIG_VERSION3 0
+%define CONFIG_AVCODEC 1
+%define CONFIG_AVDEVICE 1
+%define CONFIG_AVFILTER 1
+%define CONFIG_AVFORMAT 1
+%define CONFIG_AVRESAMPLE 1
+%define CONFIG_AVUTIL 1
+%define CONFIG_SWSCALE 1
+%define CONFIG_AVCONV 1
+%define CONFIG_AVPLAY 0
+%define CONFIG_AVPROBE 1
+%define CONFIG_DCT 1
+%define CONFIG_DOC 1
+%define CONFIG_ERROR_RESILIENCE 1
+%define CONFIG_FFT 1
+%define CONFIG_LSP 1
+%define CONFIG_LZO 1
+%define CONFIG_MDCT 1
+%define CONFIG_NETWORK 1
+%define CONFIG_RDFT 1
+%define CONFIG_MEMALIGN_HACK 0
+%define CONFIG_NEON_CLOBBER_TEST 0
+%define CONFIG_PIC 0
+%define CONFIG_POD2MAN 1
+%define CONFIG_TEXI2HTML 0
+%define CONFIG_THUMB 0
+%define CONFIG_XMM_CLOBBER_TEST 0
+%define CONFIG_AANDCTTABLES 1
+%define CONFIG_AC3DSP 1
+%define CONFIG_AUDIO_FRAME_QUEUE 1
+%define CONFIG_AUDIODSP 1
+%define CONFIG_BLOCKDSP 1
+%define CONFIG_BSWAPDSP 1
+%define CONFIG_CABAC 1
+%define CONFIG_DVPROFILE 1
+%define CONFIG_FDCTDSP 1
+%define CONFIG_GCRYPT 0
+%define CONFIG_GOLOMB 1
+%define CONFIG_GPLV3 0
+%define CONFIG_H263DSP 1
+%define CONFIG_H264CHROMA 1
+%define CONFIG_H264DSP 1
+%define CONFIG_H264PRED 1
+%define CONFIG_H264QPEL 1
+%define CONFIG_HPELDSP 1
+%define CONFIG_HUFFMAN 1
+%define CONFIG_HUFFYUVDSP 1
+%define CONFIG_HUFFYUVENCDSP 1
+%define CONFIG_IDCTDSP 1
+%define CONFIG_IIRFILTER 1
+%define CONFIG_INTRAX8 1
+%define CONFIG_LGPLV3 0
+%define CONFIG_LPC 1
+%define CONFIG_ME_CMP 1
+%define CONFIG_MPEG_ER 1
+%define CONFIG_MPEGAUDIO 1
+%define CONFIG_MPEGAUDIODSP 1
+%define CONFIG_MPEGVIDEO 1
+%define CONFIG_MPEGVIDEOENC 1
+%define CONFIG_NETTLE 0
+%define CONFIG_PIXBLOCKDSP 1
+%define CONFIG_QPELDSP 1
+%define CONFIG_RANGECODER 1
+%define CONFIG_RIFFDEC 1
+%define CONFIG_RIFFENC 1
+%define CONFIG_RTPDEC 1
+%define CONFIG_RTPENC_CHAIN 1
+%define CONFIG_SINEWIN 1
+%define CONFIG_TPELDSP 1
+%define CONFIG_VIDEODSP 1
+%define CONFIG_VP3DSP 1
+%define CONFIG_AAC_ADTSTOASC_BSF 1
+%define CONFIG_CHOMP_BSF 1
+%define CONFIG_DUMP_EXTRADATA_BSF 1
+%define CONFIG_H264_MP4TOANNEXB_BSF 1
+%define CONFIG_IMX_DUMP_HEADER_BSF 1
+%define CONFIG_MJPEG2JPEG_BSF 1
+%define CONFIG_MJPEGA_DUMP_HEADER_BSF 1
+%define CONFIG_MOV2TEXTSUB_BSF 1
+%define CONFIG_NOISE_BSF 1
+%define CONFIG_REMOVE_EXTRADATA_BSF 1
+%define CONFIG_TEXT2MOVSUB_BSF 1
+%define CONFIG_AASC_DECODER 1
+%define CONFIG_AIC_DECODER 1
+%define CONFIG_ALIAS_PIX_DECODER 1
+%define CONFIG_AMV_DECODER 1
+%define CONFIG_ANM_DECODER 1
+%define CONFIG_ANSI_DECODER 1
+%define CONFIG_ASV1_DECODER 1
+%define CONFIG_ASV2_DECODER 1
+%define CONFIG_AURA_DECODER 1
+%define CONFIG_AURA2_DECODER 1
+%define CONFIG_AVS_DECODER 1
+%define CONFIG_BETHSOFTVID_DECODER 1
+%define CONFIG_BFI_DECODER 1
+%define CONFIG_BINK_DECODER 1
+%define CONFIG_BMP_DECODER 1
+%define CONFIG_BMV_VIDEO_DECODER 1
+%define CONFIG_BRENDER_PIX_DECODER 1
+%define CONFIG_C93_DECODER 1
+%define CONFIG_CAVS_DECODER 1
+%define CONFIG_CDGRAPHICS_DECODER 1
+%define CONFIG_CDXL_DECODER 1
+%define CONFIG_CINEPAK_DECODER 1
+%define CONFIG_CLJR_DECODER 1
+%define CONFIG_CLLC_DECODER 1
+%define CONFIG_COMFORTNOISE_DECODER 1
+%define CONFIG_CSCD_DECODER 1
+%define CONFIG_CYUV_DECODER 1
+%define CONFIG_DFA_DECODER 1
+%define CONFIG_DNXHD_DECODER 1
+%define CONFIG_DPX_DECODER 1
+%define CONFIG_DSICINVIDEO_DECODER 1
+%define CONFIG_DVVIDEO_DECODER 1
+%define CONFIG_DXA_DECODER 1
+%define CONFIG_DXTORY_DECODER 1
+%define CONFIG_EACMV_DECODER 1
+%define CONFIG_EAMAD_DECODER 1
+%define CONFIG_EATGQ_DECODER 1
+%define CONFIG_EATGV_DECODER 1
+%define CONFIG_EATQI_DECODER 1
+%define CONFIG_EIGHTBPS_DECODER 1
+%define CONFIG_EIGHTSVX_EXP_DECODER 1
+%define CONFIG_EIGHTSVX_FIB_DECODER 1
+%define CONFIG_ESCAPE124_DECODER 1
+%define CONFIG_ESCAPE130_DECODER 1
+%define CONFIG_EXR_DECODER 1
+%define CONFIG_FFV1_DECODER 1
+%define CONFIG_FFVHUFF_DECODER 1
+%define CONFIG_FIC_DECODER 1
+%define CONFIG_FLASHSV_DECODER 1
+%define CONFIG_FLASHSV2_DECODER 1
+%define CONFIG_FLIC_DECODER 1
+%define CONFIG_FLV_DECODER 1
+%define CONFIG_FOURXM_DECODER 1
+%define CONFIG_FRAPS_DECODER 1
+%define CONFIG_FRWU_DECODER 1
+%define CONFIG_G2M_DECODER 1
+%define CONFIG_GIF_DECODER 1
+%define CONFIG_H261_DECODER 1
+%define CONFIG_H263_DECODER 1
+%define CONFIG_H263I_DECODER 1
+%define CONFIG_H264_DECODER 1
+%define CONFIG_HEVC_DECODER 1
+%define CONFIG_HNM4_VIDEO_DECODER 1
+%define CONFIG_HUFFYUV_DECODER 1
+%define CONFIG_IDCIN_DECODER 1
+%define CONFIG_IFF_BYTERUN1_DECODER 1
+%define CONFIG_IFF_ILBM_DECODER 1
+%define CONFIG_INDEO2_DECODER 1
+%define CONFIG_INDEO3_DECODER 1
+%define CONFIG_INDEO4_DECODER 1
+%define CONFIG_INDEO5_DECODER 1
+%define CONFIG_INTERPLAY_VIDEO_DECODER 1
+%define CONFIG_JPEG2000_DECODER 1
+%define CONFIG_JPEGLS_DECODER 1
+%define CONFIG_JV_DECODER 1
+%define CONFIG_KGV1_DECODER 1
+%define CONFIG_KMVC_DECODER 1
+%define CONFIG_LAGARITH_DECODER 1
+%define CONFIG_LOCO_DECODER 1
+%define CONFIG_MDEC_DECODER 1
+%define CONFIG_MIMIC_DECODER 1
+%define CONFIG_MJPEG_DECODER 1
+%define CONFIG_MJPEGB_DECODER 1
+%define CONFIG_MMVIDEO_DECODER 1
+%define CONFIG_MOTIONPIXELS_DECODER 1
+%define CONFIG_MPEG_XVMC_DECODER 0
+%define CONFIG_MPEG1VIDEO_DECODER 1
+%define CONFIG_MPEG2VIDEO_DECODER 1
+%define CONFIG_MPEG4_DECODER 1
+%define CONFIG_MSA1_DECODER 1
+%define CONFIG_MSMPEG4V1_DECODER 1
+%define CONFIG_MSMPEG4V2_DECODER 1
+%define CONFIG_MSMPEG4V3_DECODER 1
+%define CONFIG_MSRLE_DECODER 1
+%define CONFIG_MSS1_DECODER 1
+%define CONFIG_MSS2_DECODER 1
+%define CONFIG_MSVIDEO1_DECODER 1
+%define CONFIG_MSZH_DECODER 1
+%define CONFIG_MTS2_DECODER 1
+%define CONFIG_MVC1_DECODER 1
+%define CONFIG_MVC2_DECODER 1
+%define CONFIG_MXPEG_DECODER 1
+%define CONFIG_NUV_DECODER 1
+%define CONFIG_PAF_VIDEO_DECODER 1
+%define CONFIG_PAM_DECODER 1
+%define CONFIG_PBM_DECODER 1
+%define CONFIG_PCX_DECODER 1
+%define CONFIG_PGM_DECODER 1
+%define CONFIG_PGMYUV_DECODER 1
+%define CONFIG_PICTOR_DECODER 1
+%define CONFIG_PNG_DECODER 1
+%define CONFIG_PPM_DECODER 1
+%define CONFIG_PRORES_DECODER 1
+%define CONFIG_PTX_DECODER 1
+%define CONFIG_QDRAW_DECODER 1
+%define CONFIG_QPEG_DECODER 1
+%define CONFIG_QTRLE_DECODER 1
+%define CONFIG_R10K_DECODER 1
+%define CONFIG_R210_DECODER 1
+%define CONFIG_RAWVIDEO_DECODER 1
+%define CONFIG_RL2_DECODER 1
+%define CONFIG_ROQ_DECODER 1
+%define CONFIG_RPZA_DECODER 1
+%define CONFIG_RV10_DECODER 1
+%define CONFIG_RV20_DECODER 1
+%define CONFIG_RV30_DECODER 1
+%define CONFIG_RV40_DECODER 1
+%define CONFIG_S302M_DECODER 1
+%define CONFIG_SANM_DECODER 1
+%define CONFIG_SGI_DECODER 1
+%define CONFIG_SGIRLE_DECODER 1
+%define CONFIG_SMACKER_DECODER 1
+%define CONFIG_SMC_DECODER 1
+%define CONFIG_SP5X_DECODER 1
+%define CONFIG_SUNRAST_DECODER 1
+%define CONFIG_SVQ1_DECODER 1
+%define CONFIG_SVQ3_DECODER 1
+%define CONFIG_TARGA_DECODER 1
+%define CONFIG_THEORA_DECODER 1
+%define CONFIG_THP_DECODER 1
+%define CONFIG_TIERTEXSEQVIDEO_DECODER 1
+%define CONFIG_TIFF_DECODER 1
+%define CONFIG_TMV_DECODER 1
+%define CONFIG_TRUEMOTION1_DECODER 1
+%define CONFIG_TRUEMOTION2_DECODER 1
+%define CONFIG_TSCC_DECODER 1
+%define CONFIG_TSCC2_DECODER 1
+%define CONFIG_TXD_DECODER 1
+%define CONFIG_ULTI_DECODER 1
+%define CONFIG_UTVIDEO_DECODER 1
+%define CONFIG_V210_DECODER 1
+%define CONFIG_V210X_DECODER 1
+%define CONFIG_V410_DECODER 1
+%define CONFIG_VB_DECODER 1
+%define CONFIG_VBLE_DECODER 1
+%define CONFIG_VC1_DECODER 1
+%define CONFIG_VC1IMAGE_DECODER 1
+%define CONFIG_VCR1_DECODER 1
+%define CONFIG_VMDVIDEO_DECODER 1
+%define CONFIG_VMNC_DECODER 1
+%define CONFIG_VP3_DECODER 1
+%define CONFIG_VP5_DECODER 1
+%define CONFIG_VP6_DECODER 1
+%define CONFIG_VP6A_DECODER 1
+%define CONFIG_VP6F_DECODER 1
+%define CONFIG_VP7_DECODER 1
+%define CONFIG_VP8_DECODER 1
+%define CONFIG_VP9_DECODER 1
+%define CONFIG_VQA_DECODER 1
+%define CONFIG_WEBP_DECODER 1
+%define CONFIG_WMV1_DECODER 1
+%define CONFIG_WMV2_DECODER 1
+%define CONFIG_WMV3_DECODER 1
+%define CONFIG_WMV3IMAGE_DECODER 1
+%define CONFIG_WNV1_DECODER 1
+%define CONFIG_XAN_WC3_DECODER 1
+%define CONFIG_XAN_WC4_DECODER 1
+%define CONFIG_XBM_DECODER 1
+%define CONFIG_XL_DECODER 1
+%define CONFIG_XWD_DECODER 1
+%define CONFIG_YOP_DECODER 1
+%define CONFIG_ZEROCODEC_DECODER 1
+%define CONFIG_ZLIB_DECODER 1
+%define CONFIG_ZMBV_DECODER 1
+%define CONFIG_AAC_DECODER 1
+%define CONFIG_AAC_LATM_DECODER 1
+%define CONFIG_AC3_DECODER 1
+%define CONFIG_ALAC_DECODER 1
+%define CONFIG_ALS_DECODER 1
+%define CONFIG_AMRNB_DECODER 1
+%define CONFIG_AMRWB_DECODER 1
+%define CONFIG_APE_DECODER 1
+%define CONFIG_ATRAC1_DECODER 1
+%define CONFIG_ATRAC3_DECODER 1
+%define CONFIG_ATRAC3P_DECODER 1
+%define CONFIG_BINKAUDIO_DCT_DECODER 1
+%define CONFIG_BINKAUDIO_RDFT_DECODER 1
+%define CONFIG_BMV_AUDIO_DECODER 1
+%define CONFIG_COOK_DECODER 1
+%define CONFIG_DCA_DECODER 1
+%define CONFIG_DSICINAUDIO_DECODER 1
+%define CONFIG_EAC3_DECODER 1
+%define CONFIG_FLAC_DECODER 1
+%define CONFIG_G723_1_DECODER 1
+%define CONFIG_GSM_DECODER 1
+%define CONFIG_GSM_MS_DECODER 1
+%define CONFIG_IAC_DECODER 1
+%define CONFIG_IMC_DECODER 1
+%define CONFIG_MACE3_DECODER 1
+%define CONFIG_MACE6_DECODER 1
+%define CONFIG_METASOUND_DECODER 1
+%define CONFIG_MLP_DECODER 1
+%define CONFIG_MP1_DECODER 1
+%define CONFIG_MP1FLOAT_DECODER 1
+%define CONFIG_MP2_DECODER 1
+%define CONFIG_MP2FLOAT_DECODER 1
+%define CONFIG_MP3_DECODER 1
+%define CONFIG_MP3FLOAT_DECODER 1
+%define CONFIG_MP3ADU_DECODER 1
+%define CONFIG_MP3ADUFLOAT_DECODER 1
+%define CONFIG_MP3ON4_DECODER 1
+%define CONFIG_MP3ON4FLOAT_DECODER 1
+%define CONFIG_MPC7_DECODER 1
+%define CONFIG_MPC8_DECODER 1
+%define CONFIG_NELLYMOSER_DECODER 1
+%define CONFIG_ON2AVC_DECODER 1
+%define CONFIG_OPUS_DECODER 1
+%define CONFIG_PAF_AUDIO_DECODER 1
+%define CONFIG_QCELP_DECODER 1
+%define CONFIG_QDM2_DECODER 1
+%define CONFIG_RA_144_DECODER 1
+%define CONFIG_RA_288_DECODER 1
+%define CONFIG_RALF_DECODER 1
+%define CONFIG_SHORTEN_DECODER 1
+%define CONFIG_SIPR_DECODER 1
+%define CONFIG_SMACKAUD_DECODER 1
+%define CONFIG_TAK_DECODER 1
+%define CONFIG_TRUEHD_DECODER 1
+%define CONFIG_TRUESPEECH_DECODER 1
+%define CONFIG_TTA_DECODER 1
+%define CONFIG_TWINVQ_DECODER 1
+%define CONFIG_VMDAUDIO_DECODER 1
+%define CONFIG_VORBIS_DECODER 1
+%define CONFIG_WAVPACK_DECODER 1
+%define CONFIG_WMALOSSLESS_DECODER 1
+%define CONFIG_WMAPRO_DECODER 1
+%define CONFIG_WMAV1_DECODER 1
+%define CONFIG_WMAV2_DECODER 1
+%define CONFIG_WMAVOICE_DECODER 1
+%define CONFIG_WS_SND1_DECODER 1
+%define CONFIG_PCM_ALAW_DECODER 1
+%define CONFIG_PCM_BLURAY_DECODER 1
+%define CONFIG_PCM_DVD_DECODER 1
+%define CONFIG_PCM_F32BE_DECODER 1
+%define CONFIG_PCM_F32LE_DECODER 1
+%define CONFIG_PCM_F64BE_DECODER 1
+%define CONFIG_PCM_F64LE_DECODER 1
+%define CONFIG_PCM_LXF_DECODER 1
+%define CONFIG_PCM_MULAW_DECODER 1
+%define CONFIG_PCM_S8_DECODER 1
+%define CONFIG_PCM_S8_PLANAR_DECODER 1
+%define CONFIG_PCM_S16BE_DECODER 1
+%define CONFIG_PCM_S16LE_DECODER 1
+%define CONFIG_PCM_S16LE_PLANAR_DECODER 1
+%define CONFIG_PCM_S24BE_DECODER 1
+%define CONFIG_PCM_S24DAUD_DECODER 1
+%define CONFIG_PCM_S24LE_DECODER 1
+%define CONFIG_PCM_S24LE_PLANAR_DECODER 1
+%define CONFIG_PCM_S32BE_DECODER 1
+%define CONFIG_PCM_S32LE_DECODER 1
+%define CONFIG_PCM_S32LE_PLANAR_DECODER 1
+%define CONFIG_PCM_U8_DECODER 1
+%define CONFIG_PCM_U16BE_DECODER 1
+%define CONFIG_PCM_U16LE_DECODER 1
+%define CONFIG_PCM_U24BE_DECODER 1
+%define CONFIG_PCM_U24LE_DECODER 1
+%define CONFIG_PCM_U32BE_DECODER 1
+%define CONFIG_PCM_U32LE_DECODER 1
+%define CONFIG_PCM_ZORK_DECODER 1
+%define CONFIG_INTERPLAY_DPCM_DECODER 1
+%define CONFIG_ROQ_DPCM_DECODER 1
+%define CONFIG_SOL_DPCM_DECODER 1
+%define CONFIG_XAN_DPCM_DECODER 1
+%define CONFIG_ADPCM_4XM_DECODER 1
+%define CONFIG_ADPCM_ADX_DECODER 1
+%define CONFIG_ADPCM_CT_DECODER 1
+%define CONFIG_ADPCM_EA_DECODER 1
+%define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1
+%define CONFIG_ADPCM_EA_R1_DECODER 1
+%define CONFIG_ADPCM_EA_R2_DECODER 1
+%define CONFIG_ADPCM_EA_R3_DECODER 1
+%define CONFIG_ADPCM_EA_XAS_DECODER 1
+%define CONFIG_ADPCM_G722_DECODER 1
+%define CONFIG_ADPCM_G726_DECODER 1
+%define CONFIG_ADPCM_IMA_AMV_DECODER 1
+%define CONFIG_ADPCM_IMA_APC_DECODER 1
+%define CONFIG_ADPCM_IMA_DK3_DECODER 1
+%define CONFIG_ADPCM_IMA_DK4_DECODER 1
+%define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1
+%define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1
+%define CONFIG_ADPCM_IMA_ISS_DECODER 1
+%define CONFIG_ADPCM_IMA_QT_DECODER 1
+%define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1
+%define CONFIG_ADPCM_IMA_WAV_DECODER 1
+%define CONFIG_ADPCM_IMA_WS_DECODER 1
+%define CONFIG_ADPCM_MS_DECODER 1
+%define CONFIG_ADPCM_SBPRO_2_DECODER 1
+%define CONFIG_ADPCM_SBPRO_3_DECODER 1
+%define CONFIG_ADPCM_SBPRO_4_DECODER 1
+%define CONFIG_ADPCM_SWF_DECODER 1
+%define CONFIG_ADPCM_THP_DECODER 1
+%define CONFIG_ADPCM_VIMA_DECODER 1
+%define CONFIG_ADPCM_XA_DECODER 1
+%define CONFIG_ADPCM_YAMAHA_DECODER 1
+%define CONFIG_ASS_DECODER 1
+%define CONFIG_DVBSUB_DECODER 1
+%define CONFIG_DVDSUB_DECODER 1
+%define CONFIG_PGSSUB_DECODER 1
+%define CONFIG_SRT_DECODER 1
+%define CONFIG_XSUB_DECODER 1
+%define CONFIG_LIBFDK_AAC_DECODER 0
+%define CONFIG_LIBGSM_DECODER 0
+%define CONFIG_LIBGSM_MS_DECODER 0
+%define CONFIG_LIBILBC_DECODER 0
+%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
+%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
+%define CONFIG_LIBOPENJPEG_DECODER 0
+%define CONFIG_LIBOPUS_DECODER 0
+%define CONFIG_LIBSCHROEDINGER_DECODER 0
+%define CONFIG_LIBSPEEX_DECODER 0
+%define CONFIG_LIBVPX_VP8_DECODER 0
+%define CONFIG_LIBVPX_VP9_DECODER 0
+%define CONFIG_AAC_DEMUXER 1
+%define CONFIG_AC3_DEMUXER 1
+%define CONFIG_ADX_DEMUXER 1
+%define CONFIG_AEA_DEMUXER 1
+%define CONFIG_AIFF_DEMUXER 1
+%define CONFIG_AMR_DEMUXER 1
+%define CONFIG_ANM_DEMUXER 1
+%define CONFIG_APC_DEMUXER 1
+%define CONFIG_APE_DEMUXER 1
+%define CONFIG_ASF_DEMUXER 1
+%define CONFIG_ASS_DEMUXER 1
+%define CONFIG_AU_DEMUXER 1
+%define CONFIG_AVI_DEMUXER 1
+%define CONFIG_AVISYNTH_DEMUXER 0
+%define CONFIG_AVS_DEMUXER 1
+%define CONFIG_BETHSOFTVID_DEMUXER 1
+%define CONFIG_BFI_DEMUXER 1
+%define CONFIG_BINK_DEMUXER 1
+%define CONFIG_BMV_DEMUXER 1
+%define CONFIG_C93_DEMUXER 1
+%define CONFIG_CAF_DEMUXER 1
+%define CONFIG_CAVSVIDEO_DEMUXER 1
+%define CONFIG_CDG_DEMUXER 1
+%define CONFIG_CDXL_DEMUXER 1
+%define CONFIG_DAUD_DEMUXER 1
+%define CONFIG_DFA_DEMUXER 1
+%define CONFIG_DIRAC_DEMUXER 1
+%define CONFIG_DNXHD_DEMUXER 1
+%define CONFIG_DSICIN_DEMUXER 1
+%define CONFIG_DTS_DEMUXER 1
+%define CONFIG_DV_DEMUXER 1
+%define CONFIG_DXA_DEMUXER 1
+%define CONFIG_EA_DEMUXER 1
+%define CONFIG_EA_CDATA_DEMUXER 1
+%define CONFIG_EAC3_DEMUXER 1
+%define CONFIG_FFMETADATA_DEMUXER 1
+%define CONFIG_FILMSTRIP_DEMUXER 1
+%define CONFIG_FLAC_DEMUXER 1
+%define CONFIG_FLIC_DEMUXER 1
+%define CONFIG_FLV_DEMUXER 1
+%define CONFIG_FOURXM_DEMUXER 1
+%define CONFIG_G722_DEMUXER 1
+%define CONFIG_G723_1_DEMUXER 1
+%define CONFIG_GSM_DEMUXER 1
+%define CONFIG_GXF_DEMUXER 1
+%define CONFIG_H261_DEMUXER 1
+%define CONFIG_H263_DEMUXER 1
+%define CONFIG_H264_DEMUXER 1
+%define CONFIG_HEVC_DEMUXER 1
+%define CONFIG_HLS_DEMUXER 1
+%define CONFIG_HNM_DEMUXER 1
+%define CONFIG_IDCIN_DEMUXER 1
+%define CONFIG_IFF_DEMUXER 1
+%define CONFIG_ILBC_DEMUXER 1
+%define CONFIG_IMAGE2_DEMUXER 1
+%define CONFIG_IMAGE2PIPE_DEMUXER 1
+%define CONFIG_INGENIENT_DEMUXER 1
+%define CONFIG_IPMOVIE_DEMUXER 1
+%define CONFIG_ISS_DEMUXER 1
+%define CONFIG_IV8_DEMUXER 1
+%define CONFIG_IVF_DEMUXER 1
+%define CONFIG_JV_DEMUXER 1
+%define CONFIG_LATM_DEMUXER 1
+%define CONFIG_LMLM4_DEMUXER 1
+%define CONFIG_LXF_DEMUXER 1
+%define CONFIG_M4V_DEMUXER 1
+%define CONFIG_MATROSKA_DEMUXER 1
+%define CONFIG_MJPEG_DEMUXER 1
+%define CONFIG_MLP_DEMUXER 1
+%define CONFIG_MM_DEMUXER 1
+%define CONFIG_MMF_DEMUXER 1
+%define CONFIG_MOV_DEMUXER 1
+%define CONFIG_MP3_DEMUXER 1
+%define CONFIG_MPC_DEMUXER 1
+%define CONFIG_MPC8_DEMUXER 1
+%define CONFIG_MPEGPS_DEMUXER 1
+%define CONFIG_MPEGTS_DEMUXER 1
+%define CONFIG_MPEGTSRAW_DEMUXER 1
+%define CONFIG_MPEGVIDEO_DEMUXER 1
+%define CONFIG_MSNWC_TCP_DEMUXER 1
+%define CONFIG_MTV_DEMUXER 1
+%define CONFIG_MV_DEMUXER 1
+%define CONFIG_MVI_DEMUXER 1
+%define CONFIG_MXF_DEMUXER 1
+%define CONFIG_MXG_DEMUXER 1
+%define CONFIG_NC_DEMUXER 1
+%define CONFIG_NSV_DEMUXER 1
+%define CONFIG_NUT_DEMUXER 1
+%define CONFIG_NUV_DEMUXER 1
+%define CONFIG_OGG_DEMUXER 1
+%define CONFIG_OMA_DEMUXER 1
+%define CONFIG_PAF_DEMUXER 1
+%define CONFIG_PCM_ALAW_DEMUXER 1
+%define CONFIG_PCM_MULAW_DEMUXER 1
+%define CONFIG_PCM_F64BE_DEMUXER 1
+%define CONFIG_PCM_F64LE_DEMUXER 1
+%define CONFIG_PCM_F32BE_DEMUXER 1
+%define CONFIG_PCM_F32LE_DEMUXER 1
+%define CONFIG_PCM_S32BE_DEMUXER 1
+%define CONFIG_PCM_S32LE_DEMUXER 1
+%define CONFIG_PCM_S24BE_DEMUXER 1
+%define CONFIG_PCM_S24LE_DEMUXER 1
+%define CONFIG_PCM_S16BE_DEMUXER 1
+%define CONFIG_PCM_S16LE_DEMUXER 1
+%define CONFIG_PCM_S8_DEMUXER 1
+%define CONFIG_PCM_U32BE_DEMUXER 1
+%define CONFIG_PCM_U32LE_DEMUXER 1
+%define CONFIG_PCM_U24BE_DEMUXER 1
+%define CONFIG_PCM_U24LE_DEMUXER 1
+%define CONFIG_PCM_U16BE_DEMUXER 1
+%define CONFIG_PCM_U16LE_DEMUXER 1
+%define CONFIG_PCM_U8_DEMUXER 1
+%define CONFIG_PMP_DEMUXER 1
+%define CONFIG_PVA_DEMUXER 1
+%define CONFIG_QCP_DEMUXER 1
+%define CONFIG_R3D_DEMUXER 1
+%define CONFIG_RAWVIDEO_DEMUXER 1
+%define CONFIG_RL2_DEMUXER 1
+%define CONFIG_RM_DEMUXER 1
+%define CONFIG_ROQ_DEMUXER 1
+%define CONFIG_RPL_DEMUXER 1
+%define CONFIG_RSO_DEMUXER 1
+%define CONFIG_RTP_DEMUXER 1
+%define CONFIG_RTSP_DEMUXER 1
+%define CONFIG_SAP_DEMUXER 1
+%define CONFIG_SDP_DEMUXER 1
+%define CONFIG_SEGAFILM_DEMUXER 1
+%define CONFIG_SHORTEN_DEMUXER 1
+%define CONFIG_SIFF_DEMUXER 1
+%define CONFIG_SMACKER_DEMUXER 1
+%define CONFIG_SMJPEG_DEMUXER 1
+%define CONFIG_SMUSH_DEMUXER 1
+%define CONFIG_SOL_DEMUXER 1
+%define CONFIG_SOX_DEMUXER 1
+%define CONFIG_SPDIF_DEMUXER 1
+%define CONFIG_SRT_DEMUXER 1
+%define CONFIG_STR_DEMUXER 1
+%define CONFIG_SWF_DEMUXER 1
+%define CONFIG_TAK_DEMUXER 1
+%define CONFIG_THP_DEMUXER 1
+%define CONFIG_TIERTEXSEQ_DEMUXER 1
+%define CONFIG_TMV_DEMUXER 1
+%define CONFIG_TRUEHD_DEMUXER 1
+%define CONFIG_TTA_DEMUXER 1
+%define CONFIG_TXD_DEMUXER 1
+%define CONFIG_TTY_DEMUXER 1
+%define CONFIG_VC1_DEMUXER 1
+%define CONFIG_VC1T_DEMUXER 1
+%define CONFIG_VMD_DEMUXER 1
+%define CONFIG_VOC_DEMUXER 1
+%define CONFIG_VQF_DEMUXER 1
+%define CONFIG_W64_DEMUXER 1
+%define CONFIG_WAV_DEMUXER 1
+%define CONFIG_WC3_DEMUXER 1
+%define CONFIG_WSAUD_DEMUXER 1
+%define CONFIG_WSVQA_DEMUXER 1
+%define CONFIG_WTV_DEMUXER 1
+%define CONFIG_WV_DEMUXER 1
+%define CONFIG_XA_DEMUXER 1
+%define CONFIG_XMV_DEMUXER 1
+%define CONFIG_XWMA_DEMUXER 1
+%define CONFIG_YOP_DEMUXER 1
+%define CONFIG_YUV4MPEGPIPE_DEMUXER 1
+%define CONFIG_A64MULTI_ENCODER 1
+%define CONFIG_A64MULTI5_ENCODER 1
+%define CONFIG_ALIAS_PIX_ENCODER 1
+%define CONFIG_ASV1_ENCODER 1
+%define CONFIG_ASV2_ENCODER 1
+%define CONFIG_BMP_ENCODER 1
+%define CONFIG_CLJR_ENCODER 1
+%define CONFIG_COMFORTNOISE_ENCODER 1
+%define CONFIG_DNXHD_ENCODER 1
+%define CONFIG_DPX_ENCODER 1
+%define CONFIG_DVVIDEO_ENCODER 1
+%define CONFIG_FFV1_ENCODER 1
+%define CONFIG_FFVHUFF_ENCODER 1
+%define CONFIG_FLASHSV_ENCODER 1
+%define CONFIG_FLV_ENCODER 1
+%define CONFIG_GIF_ENCODER 1
+%define CONFIG_H261_ENCODER 1
+%define CONFIG_H263_ENCODER 1
+%define CONFIG_H263P_ENCODER 1
+%define CONFIG_HUFFYUV_ENCODER 1
+%define CONFIG_JPEGLS_ENCODER 1
+%define CONFIG_LJPEG_ENCODER 1
+%define CONFIG_MJPEG_ENCODER 1
+%define CONFIG_MPEG1VIDEO_ENCODER 1
+%define CONFIG_MPEG2VIDEO_ENCODER 1
+%define CONFIG_MPEG4_ENCODER 1
+%define CONFIG_MSMPEG4V2_ENCODER 1
+%define CONFIG_MSMPEG4V3_ENCODER 1
+%define CONFIG_PAM_ENCODER 1
+%define CONFIG_PBM_ENCODER 1
+%define CONFIG_PCX_ENCODER 1
+%define CONFIG_PGM_ENCODER 1
+%define CONFIG_PGMYUV_ENCODER 1
+%define CONFIG_PNG_ENCODER 1
+%define CONFIG_PPM_ENCODER 1
+%define CONFIG_PRORES_ENCODER 1
+%define CONFIG_QTRLE_ENCODER 1
+%define CONFIG_RAWVIDEO_ENCODER 1
+%define CONFIG_ROQ_ENCODER 1
+%define CONFIG_RV10_ENCODER 1
+%define CONFIG_RV20_ENCODER 1
+%define CONFIG_SGI_ENCODER 1
+%define CONFIG_SUNRAST_ENCODER 1
+%define CONFIG_SVQ1_ENCODER 1
+%define CONFIG_TARGA_ENCODER 1
+%define CONFIG_LIBTWOLAME_ENCODER 0
+%define CONFIG_TIFF_ENCODER 1
+%define CONFIG_UTVIDEO_ENCODER 1
+%define CONFIG_V210_ENCODER 1
+%define CONFIG_V410_ENCODER 1
+%define CONFIG_WMV1_ENCODER 1
+%define CONFIG_WMV2_ENCODER 1
+%define CONFIG_XBM_ENCODER 1
+%define CONFIG_XWD_ENCODER 1
+%define CONFIG_ZLIB_ENCODER 1
+%define CONFIG_ZMBV_ENCODER 1
+%define CONFIG_AAC_ENCODER 1
+%define CONFIG_AC3_ENCODER 1
+%define CONFIG_AC3_FIXED_ENCODER 1
+%define CONFIG_ALAC_ENCODER 1
+%define CONFIG_EAC3_ENCODER 1
+%define CONFIG_FLAC_ENCODER 1
+%define CONFIG_MP2_ENCODER 1
+%define CONFIG_NELLYMOSER_ENCODER 1
+%define CONFIG_RA_144_ENCODER 1
+%define CONFIG_VORBIS_ENCODER 1
+%define CONFIG_WMAV1_ENCODER 1
+%define CONFIG_WMAV2_ENCODER 1
+%define CONFIG_PCM_ALAW_ENCODER 1
+%define CONFIG_PCM_F32BE_ENCODER 1
+%define CONFIG_PCM_F32LE_ENCODER 1
+%define CONFIG_PCM_F64BE_ENCODER 1
+%define CONFIG_PCM_F64LE_ENCODER 1
+%define CONFIG_PCM_MULAW_ENCODER 1
+%define CONFIG_PCM_S8_ENCODER 1
+%define CONFIG_PCM_S16BE_ENCODER 1
+%define CONFIG_PCM_S16LE_ENCODER 1
+%define CONFIG_PCM_S24BE_ENCODER 1
+%define CONFIG_PCM_S24DAUD_ENCODER 1
+%define CONFIG_PCM_S24LE_ENCODER 1
+%define CONFIG_PCM_S32BE_ENCODER 1
+%define CONFIG_PCM_S32LE_ENCODER 1
+%define CONFIG_PCM_U8_ENCODER 1
+%define CONFIG_PCM_U16BE_ENCODER 1
+%define CONFIG_PCM_U16LE_ENCODER 1
+%define CONFIG_PCM_U24BE_ENCODER 1
+%define CONFIG_PCM_U24LE_ENCODER 1
+%define CONFIG_PCM_U32BE_ENCODER 1
+%define CONFIG_PCM_U32LE_ENCODER 1
+%define CONFIG_ROQ_DPCM_ENCODER 1
+%define CONFIG_ADPCM_ADX_ENCODER 1
+%define CONFIG_ADPCM_G722_ENCODER 1
+%define CONFIG_ADPCM_G726_ENCODER 1
+%define CONFIG_ADPCM_IMA_QT_ENCODER 1
+%define CONFIG_ADPCM_IMA_WAV_ENCODER 1
+%define CONFIG_ADPCM_MS_ENCODER 1
+%define CONFIG_ADPCM_SWF_ENCODER 1
+%define CONFIG_ADPCM_YAMAHA_ENCODER 1
+%define CONFIG_ASS_ENCODER 1
+%define CONFIG_DVBSUB_ENCODER 1
+%define CONFIG_DVDSUB_ENCODER 1
+%define CONFIG_XSUB_ENCODER 1
+%define CONFIG_LIBFAAC_ENCODER 0
+%define CONFIG_LIBFDK_AAC_ENCODER 0
+%define CONFIG_LIBGSM_ENCODER 0
+%define CONFIG_LIBGSM_MS_ENCODER 0
+%define CONFIG_LIBILBC_ENCODER 0
+%define CONFIG_LIBMP3LAME_ENCODER 0
+%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
+%define CONFIG_LIBOPENJPEG_ENCODER 0
+%define CONFIG_LIBOPUS_ENCODER 0
+%define CONFIG_LIBSCHROEDINGER_ENCODER 0
+%define CONFIG_LIBSPEEX_ENCODER 0
+%define CONFIG_LIBTHEORA_ENCODER 0
+%define CONFIG_LIBVO_AACENC_ENCODER 0
+%define CONFIG_LIBVO_AMRWBENC_ENCODER 0
+%define CONFIG_LIBVORBIS_ENCODER 0
+%define CONFIG_LIBVPX_VP8_ENCODER 0
+%define CONFIG_LIBVPX_VP9_ENCODER 0
+%define CONFIG_LIBWAVPACK_ENCODER 0
+%define CONFIG_LIBWEBP_ENCODER 0
+%define CONFIG_LIBX264_ENCODER 0
+%define CONFIG_LIBX265_ENCODER 0
+%define CONFIG_LIBXAVS_ENCODER 0
+%define CONFIG_LIBXVID_ENCODER 0
+%define CONFIG_AFORMAT_FILTER 1
+%define CONFIG_AMIX_FILTER 1
+%define CONFIG_ANULL_FILTER 1
+%define CONFIG_ASETPTS_FILTER 1
+%define CONFIG_ASETTB_FILTER 1
+%define CONFIG_ASHOWINFO_FILTER 1
+%define CONFIG_ASPLIT_FILTER 1
+%define CONFIG_ASYNCTS_FILTER 1
+%define CONFIG_ATRIM_FILTER 1
+%define CONFIG_BS2B_FILTER 0
+%define CONFIG_CHANNELMAP_FILTER 1
+%define CONFIG_CHANNELSPLIT_FILTER 1
+%define CONFIG_COMPAND_FILTER 1
+%define CONFIG_JOIN_FILTER 1
+%define CONFIG_RESAMPLE_FILTER 1
+%define CONFIG_VOLUME_FILTER 1
+%define CONFIG_ANULLSRC_FILTER 1
+%define CONFIG_ANULLSINK_FILTER 1
+%define CONFIG_BLACKFRAME_FILTER 0
+%define CONFIG_BOXBLUR_FILTER 0
+%define CONFIG_COPY_FILTER 1
+%define CONFIG_CROP_FILTER 1
+%define CONFIG_CROPDETECT_FILTER 0
+%define CONFIG_DELOGO_FILTER 0
+%define CONFIG_DRAWBOX_FILTER 1
+%define CONFIG_DRAWTEXT_FILTER 0
+%define CONFIG_FADE_FILTER 1
+%define CONFIG_FIELDORDER_FILTER 1
+%define CONFIG_FORMAT_FILTER 1
+%define CONFIG_FPS_FILTER 1
+%define CONFIG_FRAMEPACK_FILTER 1
+%define CONFIG_FREI0R_FILTER 0
+%define CONFIG_GRADFUN_FILTER 1
+%define CONFIG_HFLIP_FILTER 1
+%define CONFIG_HQDN3D_FILTER 0
+%define CONFIG_INTERLACE_FILTER 0
+%define CONFIG_LUT_FILTER 1
+%define CONFIG_LUTRGB_FILTER 1
+%define CONFIG_LUTYUV_FILTER 1
+%define CONFIG_NEGATE_FILTER 1
+%define CONFIG_NOFORMAT_FILTER 1
+%define CONFIG_NULL_FILTER 1
+%define CONFIG_OCV_FILTER 0
+%define CONFIG_OVERLAY_FILTER 1
+%define CONFIG_PAD_FILTER 1
+%define CONFIG_PIXDESCTEST_FILTER 1
+%define CONFIG_SCALE_FILTER 1
+%define CONFIG_SELECT_FILTER 1
+%define CONFIG_SETDAR_FILTER 1
+%define CONFIG_SETPTS_FILTER 1
+%define CONFIG_SETSAR_FILTER 1
+%define CONFIG_SETTB_FILTER 1
+%define CONFIG_SHOWINFO_FILTER 1
+%define CONFIG_SHUFFLEPLANES_FILTER 1
+%define CONFIG_SPLIT_FILTER 1
+%define CONFIG_TRANSPOSE_FILTER 1
+%define CONFIG_TRIM_FILTER 1
+%define CONFIG_UNSHARP_FILTER 1
+%define CONFIG_VFLIP_FILTER 1
+%define CONFIG_YADIF_FILTER 1
+%define CONFIG_COLOR_FILTER 1
+%define CONFIG_FREI0R_SRC_FILTER 0
+%define CONFIG_MOVIE_FILTER 1
+%define CONFIG_NULLSRC_FILTER 1
+%define CONFIG_RGBTESTSRC_FILTER 1
+%define CONFIG_TESTSRC_FILTER 1
+%define CONFIG_NULLSINK_FILTER 1
+%define CONFIG_H263_VAAPI_HWACCEL 0
+%define CONFIG_H263_VDPAU_HWACCEL 0
+%define CONFIG_H264_DXVA2_HWACCEL 0
+%define CONFIG_H264_VAAPI_HWACCEL 0
+%define CONFIG_H264_VDA_HWACCEL 0
+%define CONFIG_H264_VDA_OLD_HWACCEL 0
+%define CONFIG_H264_VDPAU_HWACCEL 0
+%define CONFIG_MPEG1_VDPAU_HWACCEL 0
+%define CONFIG_MPEG2_DXVA2_HWACCEL 0
+%define CONFIG_MPEG2_VAAPI_HWACCEL 0
+%define CONFIG_MPEG2_VDPAU_HWACCEL 0
+%define CONFIG_MPEG4_VAAPI_HWACCEL 0
+%define CONFIG_MPEG4_VDPAU_HWACCEL 0
+%define CONFIG_VC1_DXVA2_HWACCEL 0
+%define CONFIG_VC1_VAAPI_HWACCEL 0
+%define CONFIG_VC1_VDPAU_HWACCEL 0
+%define CONFIG_WMV3_DXVA2_HWACCEL 0
+%define CONFIG_WMV3_VAAPI_HWACCEL 0
+%define CONFIG_WMV3_VDPAU_HWACCEL 0
+%define CONFIG_ALSA_INDEV 1
+%define CONFIG_BKTR_INDEV 0
+%define CONFIG_DV1394_INDEV 1
+%define CONFIG_FBDEV_INDEV 1
+%define CONFIG_JACK_INDEV 0
+%define CONFIG_OSS_INDEV 1
+%define CONFIG_PULSE_INDEV 0
+%define CONFIG_SNDIO_INDEV 0
+%define CONFIG_V4L2_INDEV 1
+%define CONFIG_VFWCAP_INDEV 0
+%define CONFIG_X11GRAB_INDEV 0
+%define CONFIG_LIBCDIO_INDEV 0
+%define CONFIG_LIBDC1394_INDEV 0
+%define CONFIG_A64_MUXER 1
+%define CONFIG_AC3_MUXER 1
+%define CONFIG_ADTS_MUXER 1
+%define CONFIG_ADX_MUXER 1
+%define CONFIG_AIFF_MUXER 1
+%define CONFIG_AMR_MUXER 1
+%define CONFIG_ASF_MUXER 1
+%define CONFIG_ASS_MUXER 1
+%define CONFIG_ASF_STREAM_MUXER 1
+%define CONFIG_AU_MUXER 1
+%define CONFIG_AVI_MUXER 1
+%define CONFIG_AVM2_MUXER 1
+%define CONFIG_CAVSVIDEO_MUXER 1
+%define CONFIG_CRC_MUXER 1
+%define CONFIG_DAUD_MUXER 1
+%define CONFIG_DIRAC_MUXER 1
+%define CONFIG_DNXHD_MUXER 1
+%define CONFIG_DTS_MUXER 1
+%define CONFIG_DV_MUXER 1
+%define CONFIG_EAC3_MUXER 1
+%define CONFIG_F4V_MUXER 1
+%define CONFIG_FFMETADATA_MUXER 1
+%define CONFIG_FILMSTRIP_MUXER 1
+%define CONFIG_FLAC_MUXER 1
+%define CONFIG_FLV_MUXER 1
+%define CONFIG_FRAMECRC_MUXER 1
+%define CONFIG_FRAMEMD5_MUXER 1
+%define CONFIG_G722_MUXER 1
+%define CONFIG_GIF_MUXER 1
+%define CONFIG_GXF_MUXER 1
+%define CONFIG_H261_MUXER 1
+%define CONFIG_H263_MUXER 1
+%define CONFIG_H264_MUXER 1
+%define CONFIG_HDS_MUXER 1
+%define CONFIG_HEVC_MUXER 1
+%define CONFIG_HLS_MUXER 1
+%define CONFIG_ILBC_MUXER 1
+%define CONFIG_IMAGE2_MUXER 1
+%define CONFIG_IMAGE2PIPE_MUXER 1
+%define CONFIG_IPOD_MUXER 1
+%define CONFIG_ISMV_MUXER 1
+%define CONFIG_IVF_MUXER 1
+%define CONFIG_LATM_MUXER 1
+%define CONFIG_M4V_MUXER 1
+%define CONFIG_MD5_MUXER 1
+%define CONFIG_MATROSKA_MUXER 1
+%define CONFIG_MATROSKA_AUDIO_MUXER 1
+%define CONFIG_MJPEG_MUXER 1
+%define CONFIG_MLP_MUXER 1
+%define CONFIG_MMF_MUXER 1
+%define CONFIG_MOV_MUXER 1
+%define CONFIG_MP2_MUXER 1
+%define CONFIG_MP3_MUXER 1
+%define CONFIG_MP4_MUXER 1
+%define CONFIG_MPEG1SYSTEM_MUXER 1
+%define CONFIG_MPEG1VCD_MUXER 1
+%define CONFIG_MPEG1VIDEO_MUXER 1
+%define CONFIG_MPEG2DVD_MUXER 1
+%define CONFIG_MPEG2SVCD_MUXER 1
+%define CONFIG_MPEG2VIDEO_MUXER 1
+%define CONFIG_MPEG2VOB_MUXER 1
+%define CONFIG_MPEGTS_MUXER 1
+%define CONFIG_MPJPEG_MUXER 1
+%define CONFIG_MXF_MUXER 1
+%define CONFIG_MXF_D10_MUXER 1
+%define CONFIG_NULL_MUXER 1
+%define CONFIG_NUT_MUXER 1
+%define CONFIG_OGG_MUXER 1
+%define CONFIG_OMA_MUXER 1
+%define CONFIG_PCM_ALAW_MUXER 1
+%define CONFIG_PCM_MULAW_MUXER 1
+%define CONFIG_PCM_F64BE_MUXER 1
+%define CONFIG_PCM_F64LE_MUXER 1
+%define CONFIG_PCM_F32BE_MUXER 1
+%define CONFIG_PCM_F32LE_MUXER 1
+%define CONFIG_PCM_S32BE_MUXER 1
+%define CONFIG_PCM_S32LE_MUXER 1
+%define CONFIG_PCM_S24BE_MUXER 1
+%define CONFIG_PCM_S24LE_MUXER 1
+%define CONFIG_PCM_S16BE_MUXER 1
+%define CONFIG_PCM_S16LE_MUXER 1
+%define CONFIG_PCM_S8_MUXER 1
+%define CONFIG_PCM_U32BE_MUXER 1
+%define CONFIG_PCM_U32LE_MUXER 1
+%define CONFIG_PCM_U24BE_MUXER 1
+%define CONFIG_PCM_U24LE_MUXER 1
+%define CONFIG_PCM_U16BE_MUXER 1
+%define CONFIG_PCM_U16LE_MUXER 1
+%define CONFIG_PCM_U8_MUXER 1
+%define CONFIG_PSP_MUXER 1
+%define CONFIG_RAWVIDEO_MUXER 1
+%define CONFIG_RM_MUXER 1
+%define CONFIG_ROQ_MUXER 1
+%define CONFIG_RSO_MUXER 1
+%define CONFIG_RTP_MUXER 1
+%define CONFIG_RTSP_MUXER 1
+%define CONFIG_SAP_MUXER 1
+%define CONFIG_SEGMENT_MUXER 1
+%define CONFIG_SMJPEG_MUXER 1
+%define CONFIG_SMOOTHSTREAMING_MUXER 1
+%define CONFIG_SOX_MUXER 1
+%define CONFIG_SPDIF_MUXER 1
+%define CONFIG_SRT_MUXER 1
+%define CONFIG_SWF_MUXER 1
+%define CONFIG_TG2_MUXER 1
+%define CONFIG_TGP_MUXER 1
+%define CONFIG_TRUEHD_MUXER 1
+%define CONFIG_VC1T_MUXER 1
+%define CONFIG_VOC_MUXER 1
+%define CONFIG_WAV_MUXER 1
+%define CONFIG_WEBM_MUXER 1
+%define CONFIG_WV_MUXER 1
+%define CONFIG_YUV4MPEGPIPE_MUXER 1
+%define CONFIG_ALSA_OUTDEV 1
+%define CONFIG_OSS_OUTDEV 1
+%define CONFIG_SNDIO_OUTDEV 0
+%define CONFIG_AAC_PARSER 1
+%define CONFIG_AAC_LATM_PARSER 1
+%define CONFIG_AC3_PARSER 1
+%define CONFIG_ADX_PARSER 1
+%define CONFIG_BMP_PARSER 1
+%define CONFIG_CAVSVIDEO_PARSER 1
+%define CONFIG_COOK_PARSER 1
+%define CONFIG_DCA_PARSER 1
+%define CONFIG_DIRAC_PARSER 1
+%define CONFIG_DNXHD_PARSER 1
+%define CONFIG_DVBSUB_PARSER 1
+%define CONFIG_DVDSUB_PARSER 1
+%define CONFIG_FLAC_PARSER 1
+%define CONFIG_GSM_PARSER 1
+%define CONFIG_H261_PARSER 1
+%define CONFIG_H263_PARSER 1
+%define CONFIG_H264_PARSER 1
+%define CONFIG_HEVC_PARSER 1
+%define CONFIG_MJPEG_PARSER 1
+%define CONFIG_MLP_PARSER 1
+%define CONFIG_MPEG4VIDEO_PARSER 1
+%define CONFIG_MPEGAUDIO_PARSER 1
+%define CONFIG_MPEGVIDEO_PARSER 1
+%define CONFIG_OPUS_PARSER 1
+%define CONFIG_PNG_PARSER 1
+%define CONFIG_PNM_PARSER 1
+%define CONFIG_RV30_PARSER 1
+%define CONFIG_RV40_PARSER 1
+%define CONFIG_TAK_PARSER 1
+%define CONFIG_VC1_PARSER 1
+%define CONFIG_VORBIS_PARSER 1
+%define CONFIG_VP3_PARSER 1
+%define CONFIG_VP8_PARSER 1
+%define CONFIG_CONCAT_PROTOCOL 1
+%define CONFIG_CRYPTO_PROTOCOL 1
+%define CONFIG_FFRTMPCRYPT_PROTOCOL 0
+%define CONFIG_FFRTMPHTTP_PROTOCOL 1
+%define CONFIG_FILE_PROTOCOL 1
+%define CONFIG_GOPHER_PROTOCOL 1
+%define CONFIG_HLS_PROTOCOL 1
+%define CONFIG_HTTP_PROTOCOL 1
+%define CONFIG_HTTPPROXY_PROTOCOL 1
+%define CONFIG_HTTPS_PROTOCOL 0
+%define CONFIG_MMSH_PROTOCOL 1
+%define CONFIG_MMST_PROTOCOL 1
+%define CONFIG_MD5_PROTOCOL 1
+%define CONFIG_PIPE_PROTOCOL 1
+%define CONFIG_RTMP_PROTOCOL 1
+%define CONFIG_RTMPE_PROTOCOL 0
+%define CONFIG_RTMPS_PROTOCOL 0
+%define CONFIG_RTMPT_PROTOCOL 1
+%define CONFIG_RTMPTE_PROTOCOL 0
+%define CONFIG_RTMPTS_PROTOCOL 0
+%define CONFIG_RTP_PROTOCOL 1
+%define CONFIG_SCTP_PROTOCOL 0
+%define CONFIG_SRTP_PROTOCOL 1
+%define CONFIG_TCP_PROTOCOL 1
+%define CONFIG_TLS_PROTOCOL 0
+%define CONFIG_UDP_PROTOCOL 1
+%define CONFIG_UNIX_PROTOCOL 1
+%define CONFIG_LIBRTMP_PROTOCOL 0
+%define CONFIG_LIBRTMPE_PROTOCOL 0
+%define CONFIG_LIBRTMPS_PROTOCOL 0
+%define CONFIG_LIBRTMPT_PROTOCOL 0
+%define CONFIG_LIBRTMPTE_PROTOCOL 0
diff --git a/src/expansion.asm b/src/expansion.asm
new file mode 100644
index 0000000..4ac77bf
--- /dev/null
+++ b/src/expansion.asm
@@ -0,0 +1,91 @@
+%include "x86util.asm"
+
+SECTION .text
+
+;                           len1 len2
+; compute vec2^T·mat·vec1 =  ∑    ∑   mat[j, i] vec1[i] vec2[j]
+;                           i=1  j=1
+; All data are doubles; mat is read row-major with len1 entries per row, and
+; row j pairs with vec2[j]. The sum is left in the low double of m0.
+; No tail or empty-input handling: both loops run at least once and advance by
+; mmsize bytes, so len1 and len2 must be non-zero multiples of mmsize/8.
+%macro SCALARPRODUCT_METRIC 0
+cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos
+    shl len2q, 3                ; element counts -> byte counts
+    shl len1q, 3
+
+    add vec1q, len1q            ; point both vectors past their ends, then index
+    add vec2q, len2q            ; them with negative offsets that count up to 0
+    neg len2q
+    lea r6, [3 * len1q]         ; byte offset of the 4th row (used when mmsize == 32)
+    xorpd m0, m0                ; m0: running total over all row blocks
+
+.loop_2:                        ; one block of mmsize/8 matrix rows per iteration
+    mov rowposq, len1q
+    neg rowposq                 ; rowpos = -len1 bytes, rises to 0 along the row
+
+    xorpd m1, m1                ; m1..m4: per-row partial sums for this block
+    xorpd m2, m2
+
+%if mmsize == 32
+    xorpd m3, m3
+    xorpd m4, m4
+%endif
+
+.loop_1:                        ; one vector of vec1 against every row of the block
+    mova   m5, [vec1q + rowposq] ; aligned load: vec1 must be mmsize-aligned
+
+%if mmsize == 32
+    FMULADD_PD m4, m5, [matq + r6q],       m4, m6
+    FMULADD_PD m3, m5, [matq + 2 * len1q], m3, m6
+%endif
+
+    FMULADD_PD m2, m5, [matq + 1 * len1q], m2, m6
+    FMULADD_PD m1, m5, [matq + 0 * len1q], m1, m6
+
+    add matq,    mmsize
+    add rowposq, mmsize
+    js .loop_1
+
+    haddpd       m1, m2         ; pairwise add inside each 128-bit lane
+
+%if mmsize == 32
+    vextractf128 xmm2, ymm1, 1
+    addpd        xmm1, xmm2     ; xmm1 = { sum row 0, sum row 1 }
+
+    haddpd       m3, m4
+    vextractf128 xmm4, ymm3, 1
+    addpd        xmm3, xmm4     ; xmm3 = { sum row 2, sum row 3 }
+
+    vinsertf128 ymm1, ymm1, xmm3, 1
+%endif
+
+    FMULADD_PD m0, m1, [vec2q + len2q], m0, m6 ; weight the row sums by vec2
+
+%if mmsize == 32
+    add matq, r6                ; .loop_1 already advanced matq by one row
+%else
+    add matq, len1q
+%endif
+    add len2q, mmsize
+    js .loop_2
+
+    haddpd m0, m0               ; horizontal sum of the total
+
+%if mmsize == 32
+    vextractf128 xmm1, ymm0, 1
+    addpd xmm0, xmm1
+%endif
+    ; no emms: only SSE/AVX registers are used, MMX/x87 state is never touched
+    RET
+%endmacro
+
+INIT_XMM sse3              ; SSE3 build (haddpd is an SSE3 instruction)
+SCALARPRODUCT_METRIC
+
+INIT_YMM avx               ; AVX build; NOTE(review): INIT_YMM should give mmsize == 32 (x86inc) -- confirm
+SCALARPRODUCT_METRIC
+
+INIT_YMM fma3              ; as AVX with the fma3 cpuflag; presumably FMULADD_PD (x86util.asm) fuses here -- verify
+SCALARPRODUCT_METRIC
+
diff --git a/src/gamma_freeze_template.c b/src/gamma_freeze_template.c
new file mode 100644
index 0000000..8edda4d
--- /dev/null
+++ b/src/gamma_freeze_template.c
@@ -0,0 +1,507 @@
+/*
+ * Minimal distortion -- template for the equations definitions
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define FUNC3(a, b) a ## _ ## b
+#define FUNC2(a, b) FUNC3(a, b)
+#define FUNC(name) FUNC2(name, EQUATION)
+
+/** Template for calc_eq_coeffs_<EQUATION> (name built by FUNC()). For the points [job_idx * block_size,
+ * min((job_idx + 1) * block_size, NB_COLLOC_POINTS(ctx))) it fills eq_ctx->eq_coeffs[0..1][order][i] and
+ * eq_ctx->rhs[i]: with rhs_x when EQUATION == 0, with rhs_z otherwise. arg points to an MDCalcEqThread. */
+static void FUNC(calc_eq_coeffs)(void *arg,
+                                 unsigned int job_idx,    unsigned int nb_jobs,
+                                 unsigned int thread_idx, unsigned int nb_threads)
+{
+    const MDCalcEqThread   *et = arg;
+    const MDSolver        *ctx = et->ctx;
+    MDEquationContext  *eq_ctx = et->eq_ctx;
+
+    const int start = job_idx * et->block_size;
+    const int end   = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx));
+
+    for (int i = start; i < end; i++) {
+        const double x = eq_ctx->interp_coords[0][i];
+        const double z = eq_ctx->interp_coords[2][i];
+        const int zaxis = x <= EPS; // on-axis point: the "zaxis ? ..." branches below replace the terms divided by x
+
+        double c1o3 = (1.0 / 3.0);
+
+        double gtu[3][3], g[3][3], gu[3][3];
+        double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], dgtu[3][3][3], G[3][3][3], dG[3][3][3][3];
+        double Gt[3][3][3];
+        double dXt[3][3];
+        double A[3][3], Au[3][3], Atu[3][3];
+        double dA[3][3][3], dAu[3][3][3];
+        double Ric[3][3], Ricm[3][3];
+        double rhs_x, rhs_z;
+
+        const double gtxx = eq_ctx->interp_values[I_GTXX][i];
+        const double gtyy = eq_ctx->interp_values[I_GTYY][i];
+        const double gtzz = eq_ctx->interp_values[I_GTZZ][i];
+        const double gtxy = eq_ctx->interp_values[I_GTXY][i];
+        const double gtxz = eq_ctx->interp_values[I_GTXZ][i];
+        const double gtyz = eq_ctx->interp_values[I_GTYZ][i];
+
+        const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+                                 { gtxy, gtyy, gtyz },
+                                 { gtxz, gtyz, gtzz }};
+
+        const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i];
+        const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i];
+        const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i];
+        const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i];
+
+        const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i];
+        const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i];
+        const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i];
+        const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i];
+
+        const double dgt[3][3][3] = { // dgt[j][k][l]: derivative along j of gt[k][l]; the y slice is built from x-derivatives and values, not interpolated
+            {
+                { dx_gt11,     0.0, dx_gt13 },
+                {     0.0, dx_gt22,     0.0 },
+                { dx_gt13,     0.0, dx_gt33 },
+            },
+            {
+                {     0.0, zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 },
+                { zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x },
+                { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 },
+            },
+            {
+                { dz_gt11,     0.0, dz_gt13 },
+                {     0.0, dz_gt22,     0.0 },
+                { dz_gt13,     0.0, dz_gt33 },
+            },
+        };
+
+        const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i];
+        const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i];
+        const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i];
+        const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i];
+
+        const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i];
+        const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i];
+        const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i];
+        const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i];
+
+        const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i];
+        const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i];
+        const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i];
+        const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i];
+
+        const double d2gt[3][3][3][3] = { // d2gt[j][k][l][m]: second derivative along j and k of gt[l][m]
+            {
+                {
+                    { dxx_gt11,      0.0, dxx_gt13 },
+                    { 0.0,      dxx_gt22,      0.0 },
+                    { dxx_gt13,      0.0, dxx_gt33 },
+                },
+                {
+                    {      0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+                        zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+                },
+                {
+                    { dxz_gt11,     0.0, dxz_gt13 },
+                    { 0.0,     dxz_gt22,      0.0 },
+                    { dxz_gt13,     0.0, dxz_gt33 },
+                },
+
+            },
+            {
+                {
+                    { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+                        zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+                },
+                {
+                    { zaxis ? dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0,
+                       zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x },
+                },
+                {
+                    { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+                    { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+                       zaxis ? dxz_gt13 : dz_gt13 / x },
+                    { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+                },
+
+            },
+            {
+                {
+                    { dxz_gt11,      0.0, dxz_gt13 },
+                    {      0.0, dxz_gt22,      0.0 },
+                    { dxz_gt13,      0.0, dxz_gt33 },
+                },
+                {
+                    { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+                    { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+                       zaxis ? dxz_gt13 : dz_gt13 / x },
+                    { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+                },
+                {
+                    { dzz_gt11,      0.0, dzz_gt13 },
+                    {      0.0, dzz_gt22,      0.0 },
+                    { dzz_gt13,      0.0, dzz_gt33 },
+                },
+
+            },
+        };
+
+        const double Atxx = eq_ctx->interp_values[I_ATXX][i];
+        const double Atyy = eq_ctx->interp_values[I_ATYY][i];
+        const double Atzz = eq_ctx->interp_values[I_ATZZ][i];
+        const double Atxy = eq_ctx->interp_values[I_ATXY][i];
+        const double Atxz = eq_ctx->interp_values[I_ATXZ][i];
+        const double Atyz = eq_ctx->interp_values[I_ATYZ][i];
+
+        const double trK  = eq_ctx->interp_values[I_TRK][i];
+
+        const double dx_trK = eq_ctx->interp_values[I_TRK_DX][i];
+        const double dz_trK = eq_ctx->interp_values[I_TRK_DZ][i];
+
+        const double dtrK[3] = { dx_trK, 0.0, dz_trK };
+
+        const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i];
+        const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i];
+        const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i];
+        const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i];
+
+        const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i];
+        const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i];
+        const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i];
+        const double dz_At13 = eq_ctx->interp_values[I_ATXZ_DZ][i];
+
+        const double dAt[3][3][3] = { // same layout and y-slice construction as dgt above, for At
+            {
+                { dx_At11,     0.0, dx_At13 },
+                {     0.0, dx_At22,     0.0 },
+                { dx_At13,     0.0, dx_At33 },
+            },
+            {
+                {     0.0, zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 },
+                { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x },
+                { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 },
+            },
+            {
+                { dz_At11,     0.0, dz_At13 },
+                {     0.0, dz_At22,     0.0 },
+                { dz_At13,     0.0, dz_At33 },
+            },
+        };
+
+        const double phi    = eq_ctx->interp_values[I_PHI][i];
+
+        const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i];
+        const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i];
+
+        const double dphi[3] = { phi_dx, 0.0, phi_dz };
+
+        const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i];
+        const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i];
+        const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i];
+
+        const double d2phi[3][3] = {
+            { phi_dxx, 0.0, phi_dxz },
+            {     0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 },
+            { phi_dxz,    0.0,  phi_dzz },
+        };
+
+        const double At[3][3] = {{ Atxx, Atxy, Atxz },
+                                 { Atxy, Atyy, Atyz },
+                                 { Atxz, Atyz, Atzz }};
+
+        const double alpha     = eq_ctx->interp_values[I_ALPHA][i];
+        const double dx_alpha  = eq_ctx->interp_values[I_ALPHA_DX][i];
+        const double dz_alpha  = eq_ctx->interp_values[I_ALPHA_DZ][i];
+
+        const double dalpha[3] = { dx_alpha, 0.0, dz_alpha };
+
+        const double Xtx  = eq_ctx->interp_values[I_XTX][i];
+        const double Xtz  = eq_ctx->interp_values[I_XTZ][i];
+
+        const double Xt[3] = { Xtx, 0.0, Xtz };
+
+        const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+        // \tilde{γ}^{ij}: inverse of gt via cofactors divided by the determinant
+        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
+        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
+        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
+        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
+        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+        gtu[1][0] = gtu[0][1];
+        gtu[2][0] = gtu[0][2];
+        gtu[2][1] = gtu[1][2];
+
+        // γ_{jk} and γ^{jk} from the conformal metric: γ_{jk} = \tilde{γ}_{jk} / φ², γ^{jk} = φ² \tilde{γ}^{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                gu[j][k] = SQR(phi) * gtu[j][k];
+                g[j][k]  = gt[j][k] / SQR(phi);
+            }
+
+        // ∂_j γ_{kl} and ∂_j A_{kl}, by the product rule on \tilde{γ}_{kl} / φ² and \tilde{A}_{kl} / φ²
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
+                    dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi);
+                }
+
+        // ∂_j \tilde{γ}^{kl} = -\tilde{γ}^{km} \tilde{γ}^{ln} ∂_j \tilde{γ}_{mn}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        for (int n = 0; n < 3; n++)
+                            val += -gtu[k][m] * gtu[l][n] * dgt[j][m][n];
+                    dgtu[j][k][l] = val;
+                }
+
+        // ∂_j γ^{kl} = -γ^{km} γ^{ln} ∂_j γ_{mn}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        for (int n = 0; n < 3; n++)
+                            val += -gu[k][m] * gu[l][n] * dg[j][m][n];
+                    dgu[j][k][l] = val;
+                }
+
+        // ∂_j ∂_k γ_{lm}, expanded from γ_{lm} = \tilde{γ}_{lm} / φ²
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++) {
+                        d2g[j][k][l][m] = 6.0 *  gt      [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi))    -
+                                          2.0 *  gt      [l][m] * d2phi[j][k]       / (phi * SQR(phi)) -
+                                          2.0 * dgt   [j][l][m] * dphi[k]           / (phi * SQR(phi)) -
+                                          2.0 * dgt   [k][l][m] * dphi[j]           / (phi * SQR(phi)) +
+                                               d2gt[j][k][l][m]                     / SQR(phi);
+                    }
+
+        // \tilde{Γ}^j_{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        val += 0.5 * gtu[j][m] * (dgt[k][l][m] + dgt[l][k][m] - dgt[m][k][l]);
+                    Gt[j][k][l] = val;
+                }
+
+        // Γ^j_{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]);
+                    G[j][k][l] = val;
+                }
+
+        // ∂_j Γ^k_{lm}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++) {
+                        double val = 0.0;
+                        for (int n = 0; n < 3; n++) {
+                            val += dgu[j][k][n] * (dg    [l][m][n] +  dg   [m][l][n] -  dg   [n][l][m]) +
+                                    gu   [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]);
+                        }
+                        dG[j][k][l][m] = 0.5 * val;
+                    }
+
+        // dXt[j][k] = ∂_j (\tilde{γ}^{lm} Γ^k_{lm}); NOTE(review): contracts gtu with the non-conformal G/dG — confirm intended
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++)
+                        val += gtu[l][m] * dG[j][k][l][m] + dgtu[j][l][m] * G[k][l][m];
+                dXt[j][k] = val;
+            }
+
+        // Ric_{jk}; NOTE(review): Ric is only used for Ricm below, which is not read again in this function
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int m = 0; m < 3; m++)
+                    val += dG[m][m][j][k] - dG[k][m][j][m];
+                for (int m = 0; m < 3; m++)
+                    for (int l = 0; l < 3; l++)
+                        val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l];
+                Ric[j][k] = val;
+            }
+
+        // Ric^j_k
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += gu[j][l] * Ric[l][k];
+                Ricm[j][k] = val;
+            }
+
+        // A_{jk} = \tilde{A}_{jk} / φ²
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                A[j][k] = At[j][k] / SQR(phi);
+            }
+
+        // ∂_j A^{kl}; NOTE(review): dAu is not read again in this function
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        for (int n = 0; n < 3; n++)
+                            val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n];
+                    dAu[j][k][l] = val;
+                }
+
+        // \tilde{A}^{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++)
+                        val += gtu[j][l] * gtu[k][m] * At[l][m];
+                Atu[j][k] = val;
+            }
+
+        // A^{jk}; NOTE(review): Au is not read again in this function
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++)
+                        val += gu[j][l] * gu[k][m] * A[l][m];
+                Au[j][k] = val;
+            }
+
+        rhs_x = 0.0; // rhs = 2 * (Atu·dalpha - alpha * Gt:Atu + (2/3) * alpha * gtu·dtrK + 3 * alpha * Atu·dphi / phi), rows 0 (x) and 2 (z)
+        rhs_z = 0.0;
+        for (int j = 0; j < 3; j++) {
+            rhs_x += dalpha[j] * Atu[0][j];
+            rhs_z += dalpha[j] * Atu[2][j];
+        }
+        double val_x = 0.0;
+        double val_z = 0.0;
+        for (int j = 0; j < 3; j++) {
+            for (int k = 0; k < 3; k++) {
+                val_x += -Gt[0][j][k] * Atu[j][k];
+                val_z += -Gt[2][j][k] * Atu[j][k];
+            }
+        }
+        rhs_x += val_x * alpha;
+        rhs_z += val_z * alpha;
+        for (int j = 0; j < 3; j++) {
+            rhs_x += alpha * (2.0 / 3.0) * gtu[0][j] * dtrK[j];
+            rhs_z += alpha * (2.0 / 3.0) * gtu[2][j] * dtrK[j];
+        }
+        for (int j = 0; j < 3; j++) {
+            rhs_x += alpha * 3.0 * Atu[0][j] * dphi[j]/ phi;
+            rhs_z += alpha * 3.0 * Atu[2][j] * dphi[j]/ phi;
+        }
+
+        rhs_x *= 2.0;
+        rhs_z *= 2.0;
+
+        double X[3] = { 0.0 }; // NOTE(review): X is filled here but not read again in this function
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                X[0] += gu[j][k] * G[0][j][k];
+                X[2] += gu[j][k] * G[2][j][k];
+            }
+
+        if (EQUATION == 0) { // eq_coeffs[0][...] multiplies derivatives of β^x, eq_coeffs[1][...] those of β^z
+            /* eq 0 */
+            /* ∂_{xx}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + c1o3 * gtu[0][0] + (zaxis ? 0.5 * (gtu[1][1] + c1o3 * gtu[0][0]) : 0.0);
+            /* ∂_{xx}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0;
+            /* ∂_{zz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2];
+            /* ∂_{zz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gtu[0][2];
+
+            /* ∂_{xz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2] + (zaxis ? c1o3 * gtu[0][2] : 0.0);
+            /* ∂_{xz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[0][0];
+
+            /* ∂_{x}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (2.0 / 3.0) * Xt[0] + (zaxis ? (2.0 / 3.0) * Xt[0] : (gtu[1][1] + c1o3 * gtu[0][0]) / x);
+            /* ∂_{x}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 0.0;
+
+            /* ∂_{z}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (zaxis ? 0.0 : c1o3 * gtu[0][2] / x);
+            /* ∂_{z}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = (2.0 / 3.0) * Xt[0];
+
+            /* β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][0] + (zaxis ? 0.0 : (2.0 / 3.0) * Xt[0] / x - (gtu[1][1] + c1o3 * gtu[0][0]) / SQR(x));
+
+            /* β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][0];
+
+            eq_ctx->rhs[i]                                 = rhs_x;
+        } else {
+            /* eq 1 */
+            /* ∂_{xx}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gtu[2][0] + (zaxis ? 0.5 * c1o3 * gtu[2][0] : 0.0);
+            /* ∂_{xx}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + (zaxis ? gtu[1][1] : 0.0);
+            /* ∂_{zz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0;
+            /* ∂_{zz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2] + c1o3 * gtu[2][2];
+            /* ∂_{xz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[2][2] + (zaxis ? c1o3 * gtu[2][2] : 0.0);
+            /* ∂_{xz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2];
+
+            /* ∂_{x}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = (2.0 / 3.0) * Xt[2] + (zaxis ? (2.0 / 3.0) * Xt[2] : c1o3 * gtu[2][0] / x);
+            /* ∂_{x}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (zaxis ? 0.0 : gtu[1][1] / x);
+            /* ∂_{z}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = (zaxis ? 0.0 : c1o3 * gtu[2][2] / x);
+            /* ∂_{z}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (2.0 / 3.0) * Xt[2];
+
+            /* β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][2] + (zaxis ? 0.0 : (2.0 / 3.0) * Xt[2] / x - c1o3 * gtu[2][0] / SQR(x));
+
+            /* β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][2];
+
+            eq_ctx->rhs[i] = rhs_z;
+        }
+    }
+}
diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..cc89085
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn MinimalDistortionAxi
+
+# Source files in this directory
+SRCS = basis.c bicgstab.c md.c md_solve.c pssolve.c expansion.asm threadpool.c register.c
+
+# Subdirectories containing source files
+SUBDIRS = 
diff --git a/src/md.c b/src/md.c
new file mode 100644
index 0000000..21e38fc
--- /dev/null
+++ b/src/md.c
@@ -0,0 +1,573 @@
+#include "common.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "cctk_Parameters.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "md.h"
+#include "md_solve.h"
+#include "threadpool.h"
+
+typedef struct EvalContext { // per-thread evaluation state; only filled in by the disabled (#if 0) setup in get_coord_patch()
+    struct MDContext *md;       // owning context
+    struct CoordPatch *cp;      // NOTE(review): not assigned by the #if 0 setup code
+    const double *x;
+    const double *z;
+    double *W;
+
+    const double *coeffs;
+    double nb_coeffs[2];        // copies of solver->nb_coeffs[0..1]; NOTE(review): counts stored as double
+
+    double *eval_tmp[2];        // 32-byte aligned scratch arrays of nb_coeffs[0] resp. nb_coeffs[1] doubles
+
+    unsigned int x_idx_start;   // x index range [start, end) of this thread: set to 0 .. cctk_lsh[0]
+    unsigned int x_idx_end;
+    unsigned int z_idx_start;   // z index range [start, end): one block of ceil(cctk_lsh[2] / nb_threads) planes
+    unsigned int z_idx_end;
+} EvalContext;
+
+/* precomputed values for a given refined grid; get_coord_patch() looks patches up by origin/delta/size */
+typedef struct CoordPatch {
+    CCTK_REAL origin[3];        // copy of cctk_origin_space
+    CCTK_INT delta[3];          // copy of cctk_levfac (refinement factors, not grid spacings)
+    CCTK_INT size[3];           // copy of cctk_lsh
+
+    // basis values on the grid (only allocated in the disabled #else branch of get_coord_patch())
+    double *basis_val_r;
+    double *basis_val_z;
+
+    double *transform_z;        // only allocated in the disabled #else branch
+    double *transform_matrix;   // basis[0][0] at x times a damping factor; index: grid point + size[0] * size[2] * k
+    double *transform_matrix1;  // basis[0][1] at z times a damping factor; index: grid point * nb_coeffs[1] + k
+    double *transform_matrix2;  // as transform_matrix, for basis[1][0]
+    double *transform_matrix3;  // as transform_matrix1, for basis[1][1]
+    double *transform_tmp;      // scratch of size[0] * size[2] * nb_coeffs[1] doubles
+
+    int y_idx;                  // first y index whose coordinate satisfies |y| < 1e-8
+
+    int nb_threads;             // nb_threads, tp and ec are only set up by the disabled (#if 0) code
+    ThreadPoolContext *tp;
+    EvalContext *ec;
+} CoordPatch;
+
+struct MDContext {              // thorn-wide state; allocated once by context_init() and kept in md_context
+    MDSolver *solver;           // created by md_solver_init()
+    cGH *gh;                    // the cGH passed to context_init()
+    ThreadPoolContext *tp;      // created by md_threadpool_init() and also handed to the solver
+
+    struct {
+        double time;
+        double *coeffs;         // 2 * basis_order_r * basis_order_z doubles, 32-byte aligned
+    } solution_cache[8];
+    int nb_solutions;           // presumably the number of valid solution_cache entries — TODO confirm
+
+    double *coeffs_eval;        // basis_order_r * basis_order_z doubles, 32-byte aligned
+
+    uint64_t grid_expand_count; // NOTE(review): the two grid_expand_* counters are not updated in the code visible here
+    uint64_t grid_expand_time;
+
+    CoordPatch *patches;        // realloc()-grown array of nb_patches entries, see get_coord_patch()
+    int nb_patches;
+};
+
+/* get an approximate "main" frequency component in basis function `order` of set b: simply md_basis_colloc_point(b, order, 1) */
+static double calc_basis_freq(const MDBasisSetContext *b, int order)
+{
+    return md_basis_colloc_point(b, order, 1);
+}
+
+static CoordPatch *get_coord_patch(MDContext *md,
+                                   CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+                                   double scale_factor, double scale_power)
+{
+    /* Returns the cached patch whose origin, local size and refinement factors match the current
+     * grid of md->gh; on a miss, appends a new patch to md->patches and fills its transform
+     * matrices from the x/z coordinate arrays. Allocation failures abort through CCTK_WARN(0). */
+    cGH *cctkGH = md->gh;
+    CoordPatch *cp;
+    int i, block_size, ret = 0; /* block_size and nb_threads only serve the #if 0 code below */
+    const char *nb_threads;
+
+    for (int i = 0; i < md->nb_patches; i++) {
+        cp = &md->patches[i];
+
+        if (cp->origin[0] == md->gh->cctk_origin_space[0] &&
+            cp->origin[1] == md->gh->cctk_origin_space[1] &&
+            cp->origin[2] == md->gh->cctk_origin_space[2] &&
+            cp->size[0]   == md->gh->cctk_lsh[0]          &&
+            cp->size[1]   == md->gh->cctk_lsh[1]          &&
+            cp->size[2]   == md->gh->cctk_lsh[2]          &&
+            cp->delta[0]  == md->gh->cctk_levfac[0]  &&
+            cp->delta[1]  == md->gh->cctk_levfac[1]  &&
+            cp->delta[2]  == md->gh->cctk_levfac[2])
+            return cp;
+    }
+
+    cp = realloc(md->patches, sizeof(*md->patches) * (md->nb_patches + 1));
+    if (!cp)
+        CCTK_WARN(0, "Cannot allocate a new coordinate patch");
+    md->patches = cp;
+    cp = &md->patches[md->nb_patches];
+
+    memset(cp, 0, sizeof(*cp));
+
+    for (i = 0; i < 3; i++) { /* element-wise: cctk_lsh/cctk_levfac are int arrays, CCTK_INT may be wider */
+        cp->origin[i] = md->gh->cctk_origin_space[i];
+        cp->size[i]   = md->gh->cctk_lsh[i];
+        cp->delta[i]  = md->gh->cctk_levfac[i];
+    }
+
+    for (i = 0; i < cp->size[1]; i++)
+        if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) {
+            cp->y_idx = i;
+            break;
+        }
+    if (i == cp->size[1])
+        CCTK_WARN(0, "The grid does not include y==0");
+
+#if MD_POLAR || 1
+    ret |= posix_memalign((void**)&cp->transform_matrix,  32, sizeof(*cp->transform_matrix)  * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix2, 32, sizeof(*cp->transform_matrix2) * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix3, 32, sizeof(*cp->transform_matrix3) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_tmp,  32, sizeof(*cp->transform_tmp)  * cp->size[0] * cp->size[2] * md->solver->nb_coeffs[1]);
+    if (ret)
+        CCTK_WARN(0, "Cannot allocate the basis transform matrices");
+#pragma omp parallel for
+    for (int j = 0; j < cp->size[2]; j++) {
+        double zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, j)];
+
+        for (int i = 0; i < cp->size[0]; i++) {
+            const int idx_grid = j * cp->size[0] + i;
+
+            double xx = x[CCTK_GFINDEX3D(md->gh, i, 0, 0)];
+            double rr = sqrt(SQR(xx) + SQR(zz));
+
+            double coord0 = xx;
+            double coord1 = zz;
+
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++) {
+                double dx = calc_basis_freq(md->solver->basis[0][0], k);
+                double r0 = MIN(60.0, dx * scale_factor);
+                double fact =  exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[0][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[1]; k++) {
+                double dx = calc_basis_freq(md->solver->basis[0][1], k);
+                double r0 = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix1[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[0][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++) {
+                double dx = calc_basis_freq(md->solver->basis[1][0], k);
+                double r0 = MIN(60.0, dx * scale_factor);
+                double fact =  exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix2[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[1][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[1]; k++) {
+                double dx = calc_basis_freq(md->solver->basis[1][1], k);
+                double r0 = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix3[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[1][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact;
+            }
+        }
+    }
+#else
+    posix_memalign((void**)&cp->basis_val_r,   32, sizeof(*cp->basis_val_r)   * md->solver->nb_coeffs[0] * md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0]);
+    for (int j = 0; j < md->gh->cctk_lsh[1]; j++)
+        for (int i = 0; i < md->gh->cctk_lsh[0]; i++) {
+            CCTK_REAL xx = x[CCTK_GFINDEX3D(md->gh, i, j, 0)];
+            CCTK_REAL yy = y[CCTK_GFINDEX3D(md->gh, i, j, 0)];
+            CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));
+
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++)
+                //cp->basis_val_r  [(j * md->gh->cctk_lsh[0] + i) * md->nb_coeffs_x + k] = md->basis->eval(r, k);
+                cp->basis_val_r  [(j * md->gh->cctk_lsh[0] + i) + md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0] * k] = md->solver->basis[0]->eval(r, k);
+        }
+
+    posix_memalign((void**)&cp->basis_val_z,   32, sizeof(*cp->basis_val_z) * md->solver->nb_coeffs[1] * md->gh->cctk_lsh[2]);
+    for (int i = 0; i < md->gh->cctk_lsh[2]; i++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, i)];
+        for (int j = 0; j < md->solver->nb_coeffs[1]; j++)
+            cp->basis_val_z  [i * md->solver->nb_coeffs[1] + j] = md->solver->basis[0]->eval(fabs(zz), j);
+            //cp->basis_val_z  [i + md->gh->cctk_lsh[2] * j] = md->basis->eval(zz, j);
+    }
+    posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * md->solver->nb_coeffs[0]);
+#endif
+
+#if 0
+    nb_threads = getenv("OMP_NUM_THREADS");
+    if (nb_threads)
+        cp->nb_threads = atoi(nb_threads);
+    if (cp->nb_threads <= 0)
+        cp->nb_threads = 1;
+    md_threadpool_init(&cp->tp, cp->nb_threads);
+    cp->ec = calloc(cp->nb_threads, sizeof(*cp->ec));
+
+    block_size = (md->gh->cctk_lsh[2] + cp->nb_threads - 1) / cp->nb_threads;
+
+    for (int i = 0; i < cp->nb_threads; i++) {
+        EvalContext *ec = &cp->ec[i];
+
+        ec->md = md;
+
+        ec->nb_coeffs[0] = md->solver->nb_coeffs[0];
+        ec->nb_coeffs[1] = md->solver->nb_coeffs[1];
+
+        posix_memalign((void**)&ec->eval_tmp[0], 32, sizeof(*ec->eval_tmp[0]) * ec->nb_coeffs[0]);
+        posix_memalign((void**)&ec->eval_tmp[1], 32, sizeof(*ec->eval_tmp[1]) * ec->nb_coeffs[1]);
+
+        ec->x_idx_start = 0;
+        ec->x_idx_end = md->gh->cctk_lsh[0];
+
+        ec->z_idx_start = block_size * i;
+        ec->z_idx_end   = MIN(block_size * (i + 1), md->gh->cctk_lsh[2]);
+    }
+#endif
+
+    md->nb_patches++;
+    return cp;
+}
+
+static MDContext *md_context;
+
+/**
+ * Create the solver context and publish it through md_context.
+ *
+ * Sets up the thread pool (sized from Carpet's "num_threads" parameter), the
+ * two-equation solver, and the coefficient buffers used for evaluation and
+ * for the solution cache.
+ *
+ * @param cctkGH the cactus grid hierarchy; a pointer to it is kept in the context
+ * @return 0 on success, a negative error code on failure. On failure
+ *         md_context is left unchanged.
+ */
+static int context_init(cGH *cctkGH)
+{
+    int threads_type;
+    const int *threads = CCTK_ParameterGet("num_threads", "Carpet", &threads_type);
+
+    MDContext *md;
+    int nb_threads;
+    int ret;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    /* CCTK_ParameterGet() returns NULL when the parameter cannot be found
+     * (e.g. Carpet is not active); fall back to a single thread then instead
+     * of dereferencing NULL. A found value is passed through unchanged. */
+    nb_threads = threads ? *threads : 1;
+
+    md = calloc(1, sizeof(*md));
+    if (!md)
+        return -ENOMEM;
+
+    md->gh = cctkGH;
+
+    ret = md_threadpool_init(&md->tp, nb_threads);
+    if (ret < 0)
+        goto fail;
+
+    ret = md_solver_init(&md->solver, cctkGH, md->tp, 2,
+                         (unsigned int [2][2]){ { basis_order_r, basis_order_z },
+                                                { basis_order_r, basis_order_z }},
+                         scale_factor, filter_power, 0.0);
+    if (ret < 0)
+        goto fail;
+
+    /* posix_memalign() returns a positive errno value, not -1 */
+    ret = posix_memalign((void**)&md->coeffs_eval, 32,
+                         basis_order_r * basis_order_z * sizeof(*md->coeffs_eval));
+    if (ret) {
+        md->coeffs_eval = NULL;
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    /* each cache slot holds the coefficients of both unknown functions */
+    for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++) {
+        ret = posix_memalign((void**)&md->solution_cache[i].coeffs, 32,
+                             2 * basis_order_r * basis_order_z * sizeof(*md->solution_cache[i].coeffs));
+        if (ret) {
+            md->solution_cache[i].coeffs = NULL;
+            ret = -ENOMEM;
+            goto fail;
+        }
+    }
+
+    md_context = md;
+
+    return 0;
+
+fail:
+    /* md was zero-initialized, so buffers that were never allocated are NULL
+     * and free() on them is a no-op.
+     * NOTE(review): the thread pool and the solver are not torn down here,
+     * their cleanup functions are not visible from this part of the file. */
+    for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++)
+        free(md->solution_cache[i].coeffs);
+    free(md->coeffs_eval);
+    free(md);
+
+    return ret;
+}
+
+/**
+ * Scheduled routine: run the solver once and store the resulting
+ * coefficients as the newest entry of the solution cache.
+ *
+ * The context is created on first use. Failure to create it, or a failed
+ * solve, aborts through CCTK_WARN(0, ...) rather than continuing with a NULL
+ * context or caching an invalid solution.
+ */
+void minimal_distortion_solve(CCTK_ARGUMENTS)
+{
+    MDContext *md;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double time;
+    int ret;
+
+    if (!md_context && context_init(cctkGH) < 0)
+        CCTK_WARN(0, "Error initializing the minimal distortion context");
+
+    md = md_context;
+
+    /* current time in units of the timestep; only used in the log line below */
+    time = cctkGH->cctk_time / md->gh->cctk_delta_time;
+
+    //if (md->gh->cctk_levfac[0] != 1 || fabs(time - ceilf(time)) > 1e-8 ||
+    //    (md->nb_solutions && md->solution_cache[md->nb_solutions - 1].time == cctkGH->cctk_time))
+    //    return;
+    //if (md->gh->cctk_time < 10.0)
+    //    return;
+
+    CCTK_TimerStart("MinimalDistortion_Solve");
+    ret = md_solver_solve(md->solver);
+    CCTK_TimerStop("MinimalDistortion_Solve");
+    if (ret < 0)
+        CCTK_WARN(0, "Error solving the minimal distortion equations");
+
+    fprintf(stderr, "%d md solve: time %g %g %g\n", md->gh->cctk_levfac[0], md->gh->cctk_time, time, md->solver->coeffs[0]);
+
+    /* Store the new solution by swapping buffers instead of copying: the
+     * solver's output buffer goes into the cache and the solver gets the
+     * buffer of the slot that was dropped (or was still unused). */
+    {
+        double *tmp;
+        if (md->nb_solutions == ARRAY_ELEMS(md->solution_cache)) {
+            /* cache full: drop the oldest entry and shift the rest down */
+            tmp = md->solution_cache[0].coeffs;
+            memmove(md->solution_cache, md->solution_cache + 1, sizeof(md->solution_cache[0]) * (ARRAY_ELEMS(md->solution_cache) - 1));
+        } else {
+            md->nb_solutions++;
+            tmp = md->solution_cache[md->nb_solutions - 1].coeffs;
+        }
+        md->solution_cache[md->nb_solutions - 1].coeffs = md->solver->coeffs;
+        md->solution_cache[md->nb_solutions - 1].time   = md->gh->cctk_time;
+
+        md->solver->coeffs = tmp;
+    }
+}
+
+/* Declared here, defined outside this part of the file. Takes the same
+ * argument list as md_scalarproduct_metric_c() below, with const-qualified
+ * inputs.
+ * NOTE(review): presumably an AVX implementation of the same product -- confirm. */
+double md_scalarproduct_metric_avx(size_t len1, size_t len2, const double *mat,
+                                    const double *vec1, const double *vec2);
+
+/**
+ * Plain C evaluation of the bilinear form vec2^T * mat * vec1.
+ *
+ * @param len1 number of elements in vec1, and the length of one row of mat
+ * @param len2 number of elements in vec2, and the number of rows of mat
+ * @param mat  len2 x len1 matrix, stored row by row (element [l][m] is at
+ *             mat[l * len1 + m])
+ * @param vec1 vector contracted with the rows of mat
+ * @param vec2 vector contracted with the per-row results
+ * @return sum over l, m of vec2[l] * mat[l * len1 + m] * vec1[m];
+ *         0.0 when either length is zero
+ */
+static double md_scalarproduct_metric_c(size_t len1, size_t len2, const double *mat,
+                                         const double *vec1, const double *vec2)
+{
+    double val = 0.0;
+    /* size_t indices: the lengths are size_t, an int counter would be
+     * compared as unsigned and could overflow for large inputs */
+    for (size_t l = 0; l < len2; l++) {
+        const double *row = mat + l * len1;
+        double tmp = 0.0;
+        for (size_t m = 0; m < len1; m++)
+            tmp += row[m] * vec1[m];
+
+        val += tmp * vec2[l];
+    }
+    return val;
+}
+
+#if 0
+/* Disabled thread-pool job: evaluates the spectral expansion on the slab of
+ * grid points [x_idx_start, x_idx_end) x [z_idx_start, z_idx_end) described
+ * by the EvalContext selected through job_id, writing the result into W. */
+static void md_eval(void *arg,
+                     unsigned int job_id, unsigned int nb_jobs,
+                     unsigned int thread_idx, unsigned int nb_threads)
+{
+    EvalContext  *e = (EvalContext*)arg + job_id;
+    CoordPatch  *cp = e->cp;
+    MDContext *md = e->md;
+    const cGH   *gh = e->md->gh;
+    double       *W = e->W;
+
+    for (int k = e->z_idx_start; k < e->z_idx_end; k++) {
+        for (int i = e->x_idx_start; i < e->x_idx_end; i++) {
+            int idx = CCTK_GFINDEX3D(gh, i, cp->y_idx, k);
+            double xx = e->x[idx];
+            double zz = e->z[idx];
+            double r = sqrt(SQR(xx) + SQR(zz));
+            double phi = atan2(zz, xx);
+
+            double *basis_vec1 = e->eval_tmp[0];
+            double *basis_vec2 = e->eval_tmp[1];
+
+            for (int l = 0; l < e->nb_coeffs[0]; l++)
+                basis_vec1[l] = md->solver->basis[0]->eval(r, l);
+            /* basis_vec2 is passed below as the vector of length
+             * nb_coeffs[1], so it must be filled with exactly that many
+             * values (the loop used to run to nb_coeffs[0]) */
+            for (int l = 0; l < e->nb_coeffs[1]; l++)
+                basis_vec2[l] = md->solver->basis[1]->eval(phi, l);
+
+            W[idx] = md_scalarproduct_metric_avx(e->nb_coeffs[0], e->nb_coeffs[1], e->coeffs,
+                                                 basis_vec1, basis_vec2);
+        }
+    }
+}
+#endif
+
+/**
+ * Scheduled routine: expand the most recent cached solution onto the local
+ * grid and write it into the ML_BSSN::beta1 and ML_BSSN::beta3 grid
+ * functions on the y-plane selected by the coordinate patch.
+ *
+ * The context is created on first use (aborting if that fails). When no
+ * solution has been cached yet the routine warns and returns without
+ * touching the shift, instead of reading before the start of the cache.
+ * If export_coeffs is set, the coefficients are also copied into
+ * betax_coeffs / betaz_coeffs.
+ */
+void minimal_distortion_eval(CCTK_ARGUMENTS)
+{
+    MDContext *md;
+
+    CoordPatch *cp;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double *beta1  = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta1");
+    double *beta3  = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta3");
+
+    double time;
+
+    int64_t expand_start;
+
+    double *coeffs = NULL;
+    int ret;
+
+    if (!md_context) {
+        ret = context_init(cctkGH);
+        if (ret < 0)
+            CCTK_WARN(0, "Error initializing the minimal distortion context");
+    }
+
+    /* CCTK_VarDataPtr() returns NULL when the data is not available */
+    if (!beta1 || !beta3)
+        CCTK_WARN(0, "Error getting the ML_BSSN shift grid functions");
+
+    time  = cctkGH->cctk_time;
+
+    md = md_context;
+
+    cp = get_coord_patch(md, x, y, z, scale_factor, scale_power);
+
+#if 1
+    //coeffs = md->coeffs;
+    if (md->nb_solutions < 1) {
+        /* nothing solved yet, there is no cache entry to expand */
+        CCTK_WARN(1, "No minimal distortion solution available yet, not setting the shift");
+        return;
+    }
+    coeffs = md->solution_cache[md->nb_solutions - 1].coeffs;
+#elif 0
+    if (time < 10.0) {
+        return;
+    } else if (time < 11.0) {
+        double fact = exp(-36.0 * pow((10.0 - time), 4.0));
+        double *coeffs_src = md->solution_cache[md->nb_solutions - 1].coeffs;
+
+        coeffs     = md->coeffs_eval;
+        for (int i = 0; i < md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1] * 2; i++)
+            coeffs[i] = coeffs_src[i] * fact;
+    } else
+        coeffs = md->solution_cache[md->nb_solutions - 1].coeffs;
+
+#else
+    coeffs = md->coeffs_eval;
+
+    if (cctkGH->cctk_levfac[0] < 1 || md->nb_solutions < 2) {
+        memset(coeffs, 0, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+        //fprintf(stderr, "md eval: time %g zero\n", md->gh->cctk_time);
+    } else {
+        double *coeffs0 = md->solution_cache[md->nb_solutions - 2].coeffs;
+        double *coeffs1 = md->solution_cache[md->nb_solutions - 1].coeffs;
+        double time0 = md->solution_cache[md->nb_solutions - 2].time;
+        double time1 = md->solution_cache[md->nb_solutions - 1].time;
+
+        double fact = 1.0;
+
+        //if (time < 9.0)
+        //    fact = 1.0;
+        //else
+        //    fact = exp(-36.0 * pow((time - 9.0), 4.0));
+        //else if (time < 0.1)
+        //    fact = 0.0;
+        //else
+        //    fact = (1.0 - exp(-pow((time - 0.0) / 0.25, 4.0)));
+        //fact = 1.0;
+
+        //fprintf(stderr, "md eval: time %g interp from %g %g %g\n", md->gh->cctk_time, time0, time1, fact);
+
+        for (int i = 0; i < 2 * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]; i++)
+            coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact;
+
+    }
+#endif
+
+    /* the coefficient buffer holds the first unknown's coefficients followed
+     * by the second unknown's */
+    if (export_coeffs) {
+        memcpy(betax_coeffs, coeffs, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+        memcpy(betaz_coeffs, coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1],
+               sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+    }
+
+    CCTK_TimerStart("MinimalDistortion_Expand");
+    expand_start = gettime();
+#if 0
+#pragma omp parallel for
+    for (int k = 0; k < cctk_lsh[2]; k++) {
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k);
+            double xx = x[idx];
+            double zz = z[idx];
+            double r = sqrt(SQR(xx) + SQR(zz));
+            double phi = atan2(zz, xx);
+
+            double val = 0.0;
+
+            for (int l = 0; l < md->nb_coeffs_z; l++) {
+                double tmp = 0.0;
+                for (int m = 0; m < md->nb_coeffs_x; m++) {
+                    const int idx_coeff = l * md->nb_coeffs_x + m;
+                    tmp += coeffs[idx_coeff] * md->basis->eval(r, m);
+                }
+                val += tmp * md->basis1->eval(phi, l);
+            }
+
+            W[idx] = val;
+        }
+    }
+#elif 0
+    {
+        for (int i = 0; i < cp->nb_threads; i++) {
+            cp->ec[i].cp = cp;
+            cp->ec[i].x = x;
+            cp->ec[i].z = z;
+            cp->ec[i].W = W;
+            cp->ec[i].coeffs = coeffs;
+        }
+        md_threadpool_execute(cp->tp, cp->nb_threads, md_eval, cp->ec);
+    }
+#elif MD_POLAR || 1
+    /* first unknown -> beta1: one matrix product over all grid points of the
+     * plane, then a per-point dot product contracting the second index */
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0],
+                1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2],
+                coeffs, md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * md->solver->nb_coeffs[1], 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            beta1[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
+        }
+    /* second unknown -> beta3: same scheme with its own transform matrices
+     * and the second half of the coefficient buffer */
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0],
+                1.0, cp->transform_matrix2, cctk_lsh[0] * cctk_lsh[2],
+                coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1],
+                md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix3 + idx_grid * md->solver->nb_coeffs[1], 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            beta3[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
+        }
+#else
+    memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W));
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                md->solver->nb_coeffs[0], cctk_lsh[2], md->solver->nb_coeffs[1], 1.0,
+                coeffs, md->solver->nb_coeffs[0], cp->basis_val_z, md->solver->nb_coeffs[1],
+                0.0, cp->transform_z, md->solver->nb_coeffs[0]);
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], md->solver->nb_coeffs[0], 1.0,
+                cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, md->solver->nb_coeffs[0],
+                1.0, W, cctk_lsh[0] * cctk_lsh[1]);
+#endif
+
+    md->grid_expand_time += gettime() - expand_start;
+    md->grid_expand_count++;
+
+    CCTK_TimerStop("MinimalDistortion_Expand");
+
+    /* print stats on every 256th call */
+    if (!(md->grid_expand_count & 255)) {
+        fprintf(stderr, "Minimal distortion stats:\n");
+
+        md_solver_print_stats(md->solver);
+
+        /* the unit was printed as "md"; the value is the accumulated time
+         * divided by 1e3, next to the total printed in s after dividing by
+         * 1e6, i.e. milliseconds */
+        fprintf(stderr,
+                "%lu evals: total time %g s, avg time per call %g ms\n",
+                md->grid_expand_count, (double)md->grid_expand_time / 1e6,
+                (double)md->grid_expand_time / md->grid_expand_count / 1e3);
+    }
+}
+
+/**
+ * Scheduled routine: make sure the solver context exists.
+ *
+ * Aborts through CCTK_WARN(0, ...) when the context cannot be created, so
+ * that the failure is reported here rather than as a NULL dereference in a
+ * later routine.
+ */
+void minimal_distortion_init(CCTK_ARGUMENTS)
+{
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    if (!md_context && context_init(cctkGH) < 0)
+        CCTK_WARN(0, "Error initializing the minimal distortion context");
+}
diff --git a/src/md.h b/src/md.h
new file mode 100644
index 0000000..0a4a917
--- /dev/null
+++ b/src/md.h
@@ -0,0 +1,19 @@
+#ifndef MD_MD_H
+#define MD_MD_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#endif
+
+#include <inttypes.h>
+
+#include "cctk.h"
+
+#include "md_solve.h"
+#include "threadpool.h"
+
+/* Forward declaration only; the struct layout is not part of this header. */
+typedef struct MDContext MDContext;
+
+#endif /* MD_MD_H */
diff --git a/src/md_solve.c b/src/md_solve.c
new file mode 100644
index 0000000..c7fa329
--- /dev/null
+++ b/src/md_solve.c
@@ -0,0 +1,818 @@
+/*
+ * Minimal distortion -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "cctk.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "basis.h"
+#include "pssolve.h"
+#include "md_solve.h"
+#include "threadpool.h"
+
+/* Total number of basis coefficients / collocation points per equation: the
+ * product of the counts along the two directions. The argument is
+ * parenthesized so that any pointer expression (e.g. &obj) can be passed. */
+#define NB_COEFFS(md)        ((md)->nb_coeffs[0]        * (md)->nb_coeffs[1])
+#define NB_COLLOC_POINTS(md) ((md)->nb_colloc_points[0] * (md)->nb_colloc_points[1])
+
+/* Indices (in our code, not cactus structs) of the grid functions which we'll
+ * need to interpolate on the pseudospectral grid. They index metric_vars[]
+ * (index -> grid function name) and MDSolverPriv.interp_vars_indices[];
+ * NB_METRIC_VARS is the number of entries. */
+enum MetricVars {
+    GTXX = 0,
+    GTYY,
+    GTZZ,
+    GTXY,
+    GTXZ,
+    GTYZ,
+    PHI,
+    ATXX,
+    ATYY,
+    ATZZ,
+    ATXY,
+    ATXZ,
+    ATYZ,
+    XTX,
+    XTY,
+    XTZ,
+    ALPHA,
+    TRK,
+    NB_METRIC_VARS,
+};
+
+/* Indices of the interpolated values of the above grid functions and their
+ * derivatives. Each entry has a matching row in interp_operation_indices[]
+ * (which MetricVars function it is taken from) and interp_operation_codes[]
+ * (which derivative is applied); the _DX/_DY/_DZ/_DXX/_DXZ/_DZZ suffixes name
+ * that derivative. NB_INTERP_VARS is the number of entries. */
+enum InterpMetricVars {
+    /* gt components, plain values */
+    I_GTXX = 0,
+    I_GTYY,
+    I_GTZZ,
+    I_GTXY,
+    I_GTXZ,
+    I_GTYZ,
+    /* gt components, first derivatives */
+    I_GTXX_DX,
+    I_GTYY_DX,
+    I_GTZZ_DX,
+    I_GTXZ_DX,
+    I_GTXX_DZ,
+    I_GTYY_DZ,
+    I_GTZZ_DZ,
+    I_GTXZ_DZ,
+    /* gt components, second derivatives */
+    I_GTXX_DXX,
+    I_GTYY_DXX,
+    I_GTZZ_DXX,
+    I_GTXZ_DXX,
+    I_GTXX_DXZ,
+    I_GTYY_DXZ,
+    I_GTZZ_DXZ,
+    I_GTXZ_DXZ,
+    I_GTXX_DZZ,
+    I_GTYY_DZZ,
+    I_GTZZ_DZZ,
+    I_GTXZ_DZZ,
+    /* phi and its derivatives */
+    I_PHI,
+    I_PHI_DX,
+    I_PHI_DY,
+    I_PHI_DZ,
+    I_PHI_DXX,
+    I_PHI_DZZ,
+    I_PHI_DXZ,
+    /* At components, plain values */
+    I_ATXX,
+    I_ATYY,
+    I_ATZZ,
+    I_ATXY,
+    I_ATXZ,
+    I_ATYZ,
+    /* At components, first derivatives */
+    I_ATXX_DX,
+    I_ATYY_DX,
+    I_ATZZ_DX,
+    I_ATXZ_DX,
+    I_ATXX_DZ,
+    I_ATYY_DZ,
+    I_ATZZ_DZ,
+    I_ATXZ_DZ,
+    /* Xt components, plain values */
+    I_XTX,
+    I_XTY,
+    I_XTZ,
+    /* alpha and its first derivatives */
+    I_ALPHA,
+    I_ALPHA_DX,
+    I_ALPHA_DY,
+    I_ALPHA_DZ,
+    /* trK and its first derivatives */
+    I_TRK,
+    I_TRK_DX,
+    I_TRK_DZ,
+    NB_INTERP_VARS,
+};
+
+/* per-equation state; one instance per equation, set up by eq_init() */
+typedef struct MDEquationContext {
+    /* coordinates handed to the interpolator, one array per axis with one
+     * entry per collocation point; eq_init() sets the [1] array to 0 */
+    double *interp_coords[3];
+    /* interpolation results, one array per InterpMetricVars entry */
+    double *interp_values[NB_INTERP_VARS];
+
+    /* eq_coeffs[i][j] is an array of coefficients at the collocation points
+     * for j-th derivative of i-th unknown function */
+    double *(*eq_coeffs)[PSSOLVE_DIFF_ORDER_NB];
+
+    /* this equation's part of the shared right-hand side array
+     * (MDSolverPriv.rhs); not a separate allocation */
+    double *rhs;
+} MDEquationContext;
+
+/* private solver state, reachable through MDSolver.priv */
+struct MDSolverPriv {
+    /* the pseudospectral solver that performs the actual linear solve */
+    PSSolveContext *ps_ctx;
+    cGH *gh;
+
+    /* array with one entry per equation */
+    MDEquationContext *eqs;
+
+    /* set to the number of collocation points per direction at init */
+    int colloc_grid_order[2];
+
+    /* right-hand side for all equations, stored one equation after another */
+    double *rhs;
+
+    /* exponential filter factors, one per coefficient of each unknown; the
+     * code applying them in md_solver_solve() is currently commented out */
+    double *coeff_scale;
+
+    // interpolation parameters
+    int coord_system;
+    int interp_operator;
+    int interp_params;
+
+    CCTK_REAL *interp_coords[3];
+
+    /* cactus variable indices of the grid functions named in metric_vars[] */
+    int        interp_vars_indices[NB_METRIC_VARS];
+    CCTK_REAL *interp_values[NB_INTERP_VARS];
+    /* output type codes passed to the interpolator, all CCTK_VARIABLE_REAL */
+    CCTK_INT   interp_value_codes[NB_INTERP_VARS];
+
+#if HAVE_OPENCL
+    // OpenCL / CLBLAS stuff
+    cl_context       ocl_ctx;
+    cl_command_queue ocl_queue;
+#endif
+
+    /* thread pool used for the work; tp_internal is only set when the caller
+     * supplied no pool and the solver created its own */
+    ThreadPoolContext *tp;
+    ThreadPoolContext *tp_internal;
+
+    /* call counters and accumulated gettime() differences for the stats
+     * output (printed as seconds after dividing by 1e6) */
+    uint64_t solve_count;
+    uint64_t solve_time;
+
+    uint64_t interp_geometry_count;
+    uint64_t interp_geometry_time;
+
+    uint64_t calc_eq_coeffs_count;
+    uint64_t calc_eq_coeffs_time;
+};
+
+/* argument passed to the calc_eq_coeffs_N() thread pool jobs */
+typedef struct MDCalcEqThread {
+    MDSolver             *ctx;
+    MDEquationContext *eq_ctx;
+    /* the number of jobs is the number of collocation points divided by this,
+     * rounded up -- presumably the number of points handled per job (confirm
+     * against the template) */
+    size_t      block_size;
+} MDCalcEqThread;
+
+/* mapping between our indices (enum MetricVars) and the full cactus variable
+ * names; each name is resolved with CCTK_VarIndex() at solver init */
+static const char *metric_vars[] = {
+    [GTXX]  = "ML_BSSN::gt11",
+    [GTYY]  = "ML_BSSN::gt22",
+    [GTZZ]  = "ML_BSSN::gt33",
+    [GTXY]  = "ML_BSSN::gt12",
+    [GTXZ]  = "ML_BSSN::gt13",
+    [GTYZ]  = "ML_BSSN::gt23",
+    [ATXX]  = "ML_BSSN::At11",
+    [ATYY]  = "ML_BSSN::At22",
+    [ATZZ]  = "ML_BSSN::At33",
+    [ATXY]  = "ML_BSSN::At12",
+    [ATXZ]  = "ML_BSSN::At13",
+    [ATYZ]  = "ML_BSSN::At23",
+    [PHI]   = "ML_BSSN::phi",
+    [XTX]   = "ML_BSSN::Xt1",
+    [XTY]   = "ML_BSSN::Xt2",
+    [XTZ]   = "ML_BSSN::Xt3",
+    [ALPHA] = "ML_BSSN::alpha",
+    [TRK]   = "ML_BSSN::trK",
+};
+
+/* mapping between the cactus grid values and interpolated values: for every
+ * interpolation output (enum InterpMetricVars) the input grid function
+ * (enum MetricVars) it is computed from */
+static const CCTK_INT interp_operation_indices[] = {
+    [I_GTXX]     = GTXX,
+    [I_GTYY]     = GTYY,
+    [I_GTZZ]     = GTZZ,
+    [I_GTXY]     = GTXY,
+    [I_GTXZ]     = GTXZ,
+    [I_GTYZ]     = GTYZ,
+    [I_GTXX_DX]  = GTXX,
+    [I_GTYY_DX]  = GTYY,
+    [I_GTZZ_DX]  = GTZZ,
+    [I_GTXZ_DX]  = GTXZ,
+    [I_GTXX_DZ]  = GTXX,
+    [I_GTYY_DZ]  = GTYY,
+    [I_GTZZ_DZ]  = GTZZ,
+    [I_GTXZ_DZ]  = GTXZ,
+    [I_GTXX_DXX] = GTXX,
+    [I_GTYY_DXX] = GTYY,
+    [I_GTZZ_DXX] = GTZZ,
+    [I_GTXZ_DXX] = GTXZ,
+    [I_GTXX_DXZ] = GTXX,
+    [I_GTYY_DXZ] = GTYY,
+    [I_GTZZ_DXZ] = GTZZ,
+    [I_GTXZ_DXZ] = GTXZ,
+    [I_GTXX_DZZ] = GTXX,
+    [I_GTYY_DZZ] = GTYY,
+    [I_GTZZ_DZZ] = GTZZ,
+    [I_GTXZ_DZZ] = GTXZ,
+    [I_PHI]      = PHI,
+    [I_PHI_DX]   = PHI,
+    [I_PHI_DY]   = PHI,
+    [I_PHI_DZ]   = PHI,
+    [I_PHI_DXX]  = PHI,
+    [I_PHI_DZZ]  = PHI,
+    [I_PHI_DXZ]  = PHI,
+    [I_ATXX]     = ATXX,
+    [I_ATYY]     = ATYY,
+    [I_ATZZ]     = ATZZ,
+    [I_ATXY]     = ATXY,
+    [I_ATXZ]     = ATXZ,
+    [I_ATYZ]     = ATYZ,
+    [I_ATXX_DX]  = ATXX,
+    [I_ATYY_DX]  = ATYY,
+    [I_ATZZ_DX]  = ATZZ,
+    [I_ATXZ_DX]  = ATXZ,
+    [I_ATXX_DZ]  = ATXX,
+    [I_ATYY_DZ]  = ATYY,
+    [I_ATZZ_DZ]  = ATZZ,
+    [I_ATXZ_DZ]  = ATXZ,
+    [I_XTX]      = XTX,
+    [I_XTY]      = XTY,
+    [I_XTZ]      = XTZ,
+    [I_ALPHA]    = ALPHA,
+    [I_ALPHA_DX] = ALPHA,
+    [I_ALPHA_DY] = ALPHA,
+    [I_ALPHA_DZ] = ALPHA,
+    [I_TRK]      = TRK,
+    [I_TRK_DX]   = TRK,
+    [I_TRK_DZ]   = TRK,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during
+ * interpolation, one per enum InterpMetricVars entry. As used below:
+ * 0 is the plain value, 1/2/3 the first derivative along x/y/z and
+ * 11/13/33 the second derivatives xx/xz/zz. */
+static const CCTK_INT interp_operation_codes[] = {
+    [I_GTXX]     = 0,
+    [I_GTYY]     = 0,
+    [I_GTZZ]     = 0,
+    [I_GTXY]     = 0,
+    [I_GTXZ]     = 0,
+    [I_GTYZ]     = 0,
+    [I_GTXX_DX]  = 1,
+    [I_GTYY_DX]  = 1,
+    [I_GTZZ_DX]  = 1,
+    [I_GTXZ_DX]  = 1,
+    [I_GTXX_DZ]  = 3,
+    [I_GTYY_DZ]  = 3,
+    [I_GTZZ_DZ]  = 3,
+    [I_GTXZ_DZ]  = 3,
+    [I_GTXX_DXX] = 11,
+    [I_GTYY_DXX] = 11,
+    [I_GTZZ_DXX] = 11,
+    [I_GTXZ_DXX] = 11,
+    [I_GTXX_DXZ] = 13,
+    [I_GTYY_DXZ] = 13,
+    [I_GTZZ_DXZ] = 13,
+    [I_GTXZ_DXZ] = 13,
+    [I_GTXX_DZZ] = 33,
+    [I_GTYY_DZZ] = 33,
+    [I_GTZZ_DZZ] = 33,
+    [I_GTXZ_DZZ] = 33,
+    [I_PHI]      = 0,
+    [I_PHI_DX]   = 1,
+    [I_PHI_DY]   = 2,
+    [I_PHI_DZ]   = 3,
+    [I_PHI_DXX]  = 11,
+    [I_PHI_DZZ]  = 33,
+    [I_PHI_DXZ]  = 13,
+    [I_ATXX]     = 0,
+    [I_ATYY]     = 0,
+    [I_ATZZ]     = 0,
+    [I_ATXY]     = 0,
+    [I_ATXZ]     = 0,
+    [I_ATYZ]     = 0,
+    [I_ATXX_DX]  = 1,
+    [I_ATYY_DX]  = 1,
+    [I_ATZZ_DX]  = 1,
+    [I_ATXZ_DX]  = 1,
+    [I_ATXX_DZ]  = 3,
+    [I_ATYY_DZ]  = 3,
+    [I_ATZZ_DZ]  = 3,
+    [I_ATXZ_DZ]  = 3,
+    [I_XTX]      = 0,
+    [I_XTY]      = 0,
+    [I_XTZ]      = 0,
+    [I_ALPHA]    = 0,
+    [I_ALPHA_DX] = 1,
+    [I_ALPHA_DY] = 2,
+    [I_ALPHA_DZ] = 3,
+    [I_TRK]      = 0,
+    [I_TRK_DX]   = 1,
+    [I_TRK_DZ]   = 3,
+};
+
+/*
+ * Fill every equation's interp_values[] arrays by running the cactus
+ * interpolator on that equation's collocation point coordinates.
+ *
+ * An interpolator error is reported through CCTK_WARN(0, ...); otherwise the
+ * function returns 0.
+ */
+static int interp_geometry(MDSolver *ctx)
+{
+    MDSolverPriv *priv = ctx->priv;
+
+    for (int eq = 0; eq < ctx->nb_equations; eq++) {
+        MDEquationContext *eq_ctx = priv->eqs + eq;
+
+        /* the interpolator takes untyped arrays of array pointers */
+        const void * const *coords  = (const void * const *)eq_ctx->interp_coords;
+        void * const       *outputs = (void * const *)eq_ctx->interp_values;
+
+        const int status =
+            CCTK_InterpGridArrays(priv->gh, 3, priv->interp_operator, priv->interp_params,
+                                  priv->coord_system, NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL,
+                                  coords,
+                                  ARRAY_ELEMS(priv->interp_vars_indices), priv->interp_vars_indices,
+                                  ARRAY_ELEMS(eq_ctx->interp_values), priv->interp_value_codes,
+                                  outputs);
+
+        if (status < 0)
+            CCTK_WARN(0, "Error interpolating");
+    }
+
+    return 0;
+}
+
+#if 0
+#define EQUATION 0
+#include "md_solve_template.c"
+#undef EQUATION
+
+#define EQUATION 1
+#include "md_solve_template.c"
+#undef EQUATION
+#else
+#define EQUATION 0
+#include "gamma_freeze_template.c"
+#undef EQUATION
+
+#define EQUATION 1
+#include "gamma_freeze_template.c"
+#undef EQUATION
+#endif
+
+/* Thread pool job callbacks that compute the equation coefficients, indexed
+ * by equation number; md_solver_solve() passes them to
+ * md_threadpool_execute(). The two functions are expected to come from the
+ * template included above with EQUATION defined as 0 and 1. */
+static void (*calc_eq_coeffs[2])(void *, unsigned int, unsigned int,
+                                 unsigned int, unsigned int) = {
+    calc_eq_coeffs_0,
+    calc_eq_coeffs_1,
+};
+
+/*
+ * Run one complete solve:
+ *   1. interpolate the grid functions onto the collocation points,
+ *   2. compute the equation coefficients for every equation on the thread pool,
+ *   3. hand coefficients and right-hand side to the pseudospectral solver,
+ *      which writes the result into ctx->coeffs.
+ *
+ * Returns 0 on success or the negative error code of the failing stage. The
+ * time and call count of each stage are accumulated for
+ * md_solver_print_stats().
+ */
+int md_solver_solve(MDSolver *ctx)
+{
+    MDSolverPriv *priv = ctx->priv;
+    const double *(*coeffs_per_eq[2])[PSSOLVE_DIFF_ORDER_NB];
+    int64_t t_total, t_stage;
+    int err;
+
+    t_total = gettime();
+
+    /* stage 1: interpolate the metric values and their derivatives */
+    CCTK_TimerStart("MinimalDistortion_interp_geometry");
+    t_stage = gettime();
+
+    err = interp_geometry(ctx);
+
+    priv->interp_geometry_time += gettime() - t_stage;
+    priv->interp_geometry_count++;
+    CCTK_TimerStop("MinimalDistortion_interp_geometry");
+    if (err < 0)
+        return err;
+
+    /* stage 2: one batch of thread pool jobs per equation */
+    CCTK_TimerStart("MinimalDistortion_calc_eq_coeffs");
+    t_stage = gettime();
+
+    for (int eq = 0; eq < ctx->nb_equations; eq++) {
+        MDCalcEqThread job_arg;
+
+        job_arg.ctx        = ctx;
+        job_arg.eq_ctx     = priv->eqs + eq;
+        job_arg.block_size = 256;
+
+        /* number of jobs: collocation points / block size, rounded up */
+        md_threadpool_execute(priv->tp,
+                              (NB_COLLOC_POINTS(ctx) + job_arg.block_size - 1) / job_arg.block_size,
+                              calc_eq_coeffs[eq], &job_arg);
+    }
+
+    coeffs_per_eq[0] = priv->eqs[0].eq_coeffs;
+    coeffs_per_eq[1] = priv->eqs[1].eq_coeffs;
+
+    priv->calc_eq_coeffs_time += gettime() - t_stage;
+    priv->calc_eq_coeffs_count++;
+    CCTK_TimerStop("MinimalDistortion_calc_eq_coeffs");
+
+    /* stage 3: the actual linear solve */
+    err = md_pssolve_solve(priv->ps_ctx, coeffs_per_eq, priv->rhs, ctx->coeffs);
+    if (err < 0)
+        return err;
+
+    //for (int i = 0; i < ctx->nb_equations * NB_COEFFS(ctx); i++)
+    //    ctx->coeffs[i] *= s->coeff_scale[i];
+
+    priv->solve_count++;
+    priv->solve_time += gettime() - t_total;
+
+    return 0;
+}
+
+/*
+ * Print the accumulated timing statistics of the solver stages to stderr.
+ *
+ * Each line shows the stage's share of the total solve time, its call count,
+ * the total time in seconds and the average time per call in milliseconds.
+ *
+ * The counters are 64-bit, so they are cast to unsigned long long and printed
+ * with %llu; %lu only matches on platforms where long is 64 bits wide.
+ */
+void md_solver_print_stats(MDSolver *ctx)
+{
+    MDSolverPriv *s = ctx->priv;
+
+    fprintf(stderr,
+            "%g%% interpolate geometry: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->interp_geometry_time * 100 / s->solve_time,
+            (unsigned long long)s->interp_geometry_count, (double)s->interp_geometry_time / 1e6,
+            (double)s->interp_geometry_time / s->interp_geometry_count / 1e3);
+    fprintf(stderr,
+            "%g%% calc equation coefficients: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->calc_eq_coeffs_time * 100 / s->solve_time,
+            (unsigned long long)s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6,
+            (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3);
+    fprintf(stderr,
+            "%g%% pseudospectral matrix construction: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6,
+            (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3);
+    fprintf(stderr,
+            "%g%% BiCGSTAB %llu solves, "
+            "%llu iterations, total time %g s, "
+            "avg iterations per solve %g, avg time per solve %g ms, "
+            "avg time per iteration %g ms\n",
+            (double)s->ps_ctx->cg_time_total * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->cg_solve_count, (unsigned long long)s->ps_ctx->cg_iter_count,
+            (double)s->ps_ctx->cg_time_total / 1e6,
+            (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3);
+    fprintf(stderr,
+            "%g%% LU %llu solves, total time %g s, avg time per solve %g ms\n",
+            (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6,
+            (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3);
+}
+
+/*
+ * Try to set up an OpenCL context, a command queue on the first GPU device
+ * of the first platform, and clBLAS. All failures are non-fatal: a message is
+ * printed and s->ocl_ctx / s->ocl_queue are left (or reset to) 0.
+ * Without HAVE_OPENCL this function does nothing.
+ */
+static void init_opencl(MDSolver *ctx)
+#if HAVE_OPENCL
+{
+    MDSolverPriv *s = ctx->priv;
+    /* the OpenCL entry points take cl_int / cl_uint pointers; a plain int
+     * for the counts is an incompatible pointer type */
+    cl_int  err;
+    cl_uint count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+    cl_device_id ocl_device;
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    /* zero-terminated property list selecting the platform */
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !s->ocl_ctx) {
+        fprintf(stderr, "Could not create an OpenCL context\n");
+        return;
+    }
+
+    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !s->ocl_queue) {
+        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    return;
+fail:
+    /* release whatever was created so that the handles are either both
+     * valid or both 0 */
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    s->ocl_queue = 0;
+
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+    s->ocl_ctx = 0;
+}
+#else
+{
+}
+#endif
+
+/*
+ * Set up the state of equation eq_idx: the interpolation coordinates (taken
+ * from the pseudospectral solver's collocation grid for this equation), the
+ * buffers for the interpolated values and the equation coefficients, and the
+ * pointer to this equation's part of the shared right-hand side.
+ *
+ * Returns 0 on success, -ENOMEM when an allocation fails.
+ */
+static int eq_init(MDSolver *ctx, unsigned int eq_idx)
+{
+    MDSolverPriv      *priv = ctx->priv;
+    MDEquationContext *eq   = priv->eqs + eq_idx;
+    double *grid_x = priv->ps_ctx->colloc_grid[eq_idx][0];
+    double *grid_z = priv->ps_ctx->colloc_grid[eq_idx][1];
+    size_t pt;
+    int err;
+
+    /* prepare the state for the cactus interpolator */
+    for (int axis = 0; axis < ARRAY_ELEMS(eq->interp_coords); axis++) {
+        err = posix_memalign((void**)&eq->interp_coords[axis], 32,
+                             NB_COLLOC_POINTS(ctx) * sizeof(*eq->interp_coords[axis]));
+        if (err)
+            return -ENOMEM;
+    }
+
+    /* the points are the tensor product of the two 1D grids, with the first
+     * direction varying fastest; the middle coordinate is always 0 */
+    pt = 0;
+    for (int iz = 0; iz < ctx->nb_colloc_points[1]; iz++)
+        for (int ix = 0; ix < ctx->nb_colloc_points[0]; ix++) {
+            eq->interp_coords[0][pt] = grid_x[ix];
+            eq->interp_coords[1][pt] = 0;
+            eq->interp_coords[2][pt] = grid_z[iz];
+            pt++;
+        }
+
+    for (int var = 0; var < ARRAY_ELEMS(eq->interp_values); var++) {
+        err = posix_memalign((void**)&eq->interp_values[var], 32,
+                             NB_COLLOC_POINTS(ctx) * sizeof(*eq->interp_values[var]));
+        if (err)
+            return -ENOMEM;
+    }
+
+    /* allocate the equation coefficients: one array per unknown function and
+     * derivative order */
+    eq->eq_coeffs = calloc(ctx->nb_equations, sizeof(*eq->eq_coeffs));
+    if (!eq->eq_coeffs)
+        return -ENOMEM;
+    for (int func = 0; func < ctx->nb_equations; func++) {
+        for (int diff = 0; diff < ARRAY_ELEMS(eq->eq_coeffs[func]); diff++) {
+            err = posix_memalign((void**)&eq->eq_coeffs[func][diff], 32,
+                                 NB_COLLOC_POINTS(ctx) * sizeof(*eq->eq_coeffs[func][diff]));
+            if (err)
+                return -ENOMEM;
+        }
+    }
+
+    /* setup the RHS pointer: each equation's part directly follows the
+     * previous equation's part */
+    eq->rhs = (eq_idx == 0) ? priv->rhs
+                            : priv->eqs[eq_idx - 1].rhs + NB_COLLOC_POINTS(ctx);
+
+    return 0;
+}
+
+/* basis family used for each equation and direction, indexed as
+ * basis_sets[equation][direction]; passed to md_basis_init() when the
+ * solver's basis sets are created */
+static const enum MDBasisFamily basis_sets[2][2] = {
+    { MD_BASIS_FAMILY_SB_ODD,  MD_BASIS_FAMILY_SB_EVEN },
+    { MD_BASIS_FAMILY_SB_EVEN, MD_BASIS_FAMILY_SB_ODD  },
+};
+
+int md_solver_init(MDSolver **pctx,
+                   cGH *cctkGH, ThreadPoolContext *tp,
+                   unsigned int nb_equations,
+                   unsigned int (*basis_order)[2],
+                   double sf, double filter_power, double input_filter_power)
+{
+    MDSolver *ctx;
+    MDSolverPriv *s;
+    int max_order = 0;
+    int ret;
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->priv = calloc(1, sizeof(*ctx->priv));
+    if (!ctx->priv)
+        goto fail;
+    s = ctx->priv;
+
+    s->gh = cctkGH;
+
+    if (tp) {
+        s->tp = tp;
+    } else {
+        ret = md_threadpool_init(&s->tp_internal, 1);
+        if (ret < 0)
+            goto fail;
+        s->tp = s->tp_internal;
+    }
+
+    s->eqs = calloc(nb_equations, sizeof(*s->eqs));
+    if (!s->eqs)
+        goto fail;
+    ctx->nb_equations = nb_equations;
+
+    ctx->nb_coeffs[0] = basis_order[0][0];
+    ctx->nb_coeffs[1] = basis_order[0][1];
+
+    ctx->nb_colloc_points[0] = basis_order[0][0];
+    ctx->nb_colloc_points[1] = basis_order[0][1];
+
+    if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx))
+        CCTK_WARN(0, "Non-square collocation matrix");
+
+    s->colloc_grid_order[0] = ctx->nb_colloc_points[0];
+    s->colloc_grid_order[1] = ctx->nb_colloc_points[1];
+
+    ret  = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * nb_equations * NB_COEFFS(ctx));
+    ret |= posix_memalign((void**)&s->rhs,      32, sizeof(*s->rhs)      * nb_equations * NB_COLLOC_POINTS(ctx));
+    if (ret)
+        goto fail;
+
+    for (int i = 0; i < ctx->nb_equations; i++)
+        for (int j = 0; j < 2; j++) {
+            double sf;
+
+            ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], 1.0);
+            if (ret < 0)
+                goto fail;
+
+            sf = 64.0 / md_basis_colloc_point(ctx->basis[i][j], s->colloc_grid_order[j],
+                                               ctx->nb_colloc_points[j] - 1);
+            md_basis_free(&ctx->basis[i][j]);
+
+            ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], sf);
+            if (ret < 0)
+                goto fail;
+        }
+
+    init_opencl(ctx);
+
+    ret = md_pssolve_context_alloc(&s->ps_ctx, 2);
+    if (ret < 0)
+        CCTK_WARN(0, "Error allocating the pseudospectral solver");
+
+    for (int i = 0; i < 2; i++)
+        for (int j = 0; j < 2; j++) {
+            s->ps_ctx->basis[i][j]       = ctx->basis[i][j];
+            s->ps_ctx->solve_order[i][j] = basis_order[i][j];
+            max_order = MAX(max_order, basis_order[i][j]);
+        }
+
+    s->ps_ctx->tp        = s->tp;
+
+#if HAVE_OPENCL
+    s->ps_ctx->ocl_ctx   = s->ocl_ctx;
+    s->ps_ctx->ocl_queue = s->ocl_queue;
+#endif
+
+    ret = md_pssolve_context_init(s->ps_ctx);
+    if (ret < 0)
+        CCTK_WARN(0, "Error initializing the pseudospectral solver");
+
+    for (int i = 0; i < max_order; i++) {
+        fprintf(stderr, "%d ", i);
+        for (int j = 0; j < 2; j++)
+            for (int k = 0; k < 2; k++) {
+                if (i < s->ps_ctx->solve_order[j][k])
+                    fprintf(stderr, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]);
+                else
+                    fprintf(stderr, "        ");
+            }
+        fprintf(stderr, "\n");
+    }
+
+    /* init the per-equation state */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        ret = eq_init(ctx, i);
+        if (ret < 0)
+            goto fail;
+    }
+
+    ret = posix_memalign((void**)&s->coeff_scale, 32, 2 * NB_COEFFS(ctx) * sizeof(*s->coeff_scale));
+    if (ret)
+        goto fail;
+    for (int j = 0; j < ctx->nb_coeffs[1]; j++)
+        for (int i = 0; i < ctx->nb_coeffs[0]; i++) {
+            s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+                                                        exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+            s->coeff_scale[NB_COEFFS(ctx) + j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+                                                                         exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+        }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) {
+#if 0
+        ret = posix_memalign((void**)&s->interp_values[i], 32,
+                             2 * NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i]));
+        if (ret)
+            goto fail;
+#endif
+        s->interp_value_codes[i] = CCTK_VARIABLE_REAL;
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
+        s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+        if (s->interp_vars_indices[i] < 0)
+            CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
+    }
+
+    s->coord_system = CCTK_CoordSystemHandle("cart3d");
+    if (s->coord_system < 0)
+        CCTK_WARN(0, "Error getting the coordinate system");
+
+    s->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
+    if (s->interp_operator < 0)
+        CCTK_WARN(0, "Error getting the interpolation operator");
+
+    s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
+    if (s->interp_params < 0)
+        CCTK_WARN(0, "Error creating interpolation parameters table");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_codes, "operation_codes");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operation codes");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_indices, "operand_indices");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operand indices");
+
+    CCTK_TimerCreate("MinimalDistortion_Solve");
+    CCTK_TimerCreate("MinimalDistortion_Expand");
+    CCTK_TimerCreate("MinimalDistortion_interp_geometry");
+    CCTK_TimerCreate("MinimalDistortion_calc_eq_coeffs");
+    CCTK_TimerCreate("MinimalDistortion_construct_matrix");
+    CCTK_TimerCreate("MinimalDistortion_solve_LU");
+    CCTK_TimerCreate("MinimalDistortion_solve_BiCGSTAB");
+
+    *pctx = ctx;
+    return 0;
+fail:
+    md_solver_free(&ctx);
+    return -ENOMEM;
+}
+
+/* Release *pctx with everything it owns and reset *pctx to NULL; a NULL *pctx is a no-op. */
+void md_solver_free(MDSolver **pctx)
+{
+    MDSolver *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_coords); i++)
+            free(ctx->priv->interp_coords[i]);
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_values); i++)
+            free(ctx->priv->interp_values[i]);
+        free(ctx->priv->rhs);
+        free(ctx->priv->coeff_scale);
+
+        /* md_solver_init() calls this from its failure path, where eqs may still be NULL */
+        for (unsigned int i = 0; ctx->priv->eqs && i < ctx->nb_equations; i++) {
+            MDEquationContext *eq_ctx = &ctx->priv->eqs[i];
+            for (int j = 0; j < ARRAY_ELEMS(eq_ctx->interp_coords); j++)
+                free(eq_ctx->interp_coords[j]);
+            for (int j = 0; j < ARRAY_ELEMS(eq_ctx->interp_values); j++)
+                free(eq_ctx->interp_values[j]);
+
+            if (eq_ctx->eq_coeffs) {
+                for (int j = 0; j < ctx->nb_equations; j++)
+                    for (int k = 0; k < ARRAY_ELEMS(eq_ctx->eq_coeffs[j]); k++)
+                        free(eq_ctx->eq_coeffs[j][k]);
+            }
+            free(eq_ctx->eq_coeffs);
+        }
+        free(ctx->priv->eqs);
+
+        md_pssolve_context_free(&ctx->priv->ps_ctx);
+        md_threadpool_free(&ctx->priv->tp_internal);
+
+#if HAVE_OPENCL
+        if (ctx->priv->ocl_queue)
+            clReleaseCommandQueue(ctx->priv->ocl_queue);
+        if (ctx->priv->ocl_ctx)
+            clReleaseContext(ctx->priv->ocl_ctx);
+#endif
+    }
+
+    /* NOTE(review): ctx->basis[][] is not released here -- confirm another owner frees it */
+    free(ctx->priv);
+    free(ctx->coeffs);
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/md_solve.h b/src/md_solve.h
new file mode 100644
index 0000000..07d313a
--- /dev/null
+++ b/src/md_solve.h
@@ -0,0 +1,58 @@
+/*
+ * Minimal distortion -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_SOLVE_H
+#define MD_SOLVE_H
+
+#include "common.h"
+
+#include "cctk.h"
+
+#include "basis.h"
+#include "threadpool.h"
+
+typedef struct MDSolverPriv MDSolverPriv; /* opaque here; only used through the priv pointer below */
+
+typedef struct MDSolver {
+    MDSolverPriv *priv; /* solver-internal state, released by md_solver_free() */
+
+    unsigned int nb_equations; /* bounds the per-equation loops in md_solver_init() and md_solver_free() */
+
+    MDBasisSetContext *basis[2][2]; /* md_solver_init() creates basis[i][j] for i < nb_equations, j < 2 */
+
+    int nb_coeffs[2]; /* NOTE(review): presumably the coefficient count per direction -- confirm */
+    int nb_colloc_points[2]; /* md_solver_init() rejects NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx) */
+
+    double *coeffs; /* nb_equations * NB_COEFFS(ctx) doubles, allocated by md_solver_init() */
+
+    ThreadPoolContext *tp; /* NOTE(review): presumably the caller-supplied thread pool -- confirm */
+} MDSolver;
+
+int md_solver_init(MDSolver **ctx, /* receives the new solver when 0 is returned; negative return on failure */
+                   cGH *cctkGH, ThreadPoolContext *tp,
+                   unsigned int nb_equations,
+                   unsigned int (*basis_order)[2], /* basis_order[i][j] becomes the pseudospectral solve order [i][j] */
+                   double sf, double filter_power, double input_filter_power);
+
+void md_solver_free(MDSolver **ctx); /* frees *ctx and sets it to NULL; does nothing when *ctx is NULL */
+
+int md_solver_solve(MDSolver *ctx); /* NOTE(review): return convention is not visible from this header -- confirm */
+
+void md_solver_print_stats(MDSolver *ctx);
+
+#endif /* MD_SOLVE_H */
diff --git a/src/md_solve_template.c b/src/md_solve_template.c
new file mode 100644
index 0000000..260405e
--- /dev/null
+++ b/src/md_solve_template.c
@@ -0,0 +1,577 @@
+/*
+ * Minimal distortion -- template for the equations definitions
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define FUNC3(a, b) a ## _ ## b /* paste a and b together with an underscore in between */
+#define FUNC2(a, b) FUNC3(a, b) /* extra level so that macro arguments are expanded before pasting */
+#define FUNC(name) FUNC2(name, EQUATION) /* name_<EQUATION>, e.g. calc_eq_coeffs_0 when EQUATION is 0 */
+
+/** A template for calculating the equation coefficients; FUNC() appends _EQUATION to the function name.
+ * For every collocation point i of this job's block it fills eq_ctx->eq_coeffs[var][PSSOLVE_DIFF_ORDER_*][i]
+ * (var 0: terms multiplying β^x, var 1: β^z) and eq_ctx->rhs[i]; nb_jobs, thread_idx and nb_threads are unused. */
+static void FUNC(calc_eq_coeffs)(void *arg,
+                                 unsigned int job_idx,    unsigned int nb_jobs,
+                                 unsigned int thread_idx, unsigned int nb_threads)
+{
+    const MDCalcEqThread   *et = arg;
+    const MDSolver        *ctx = et->ctx;
+    MDEquationContext  *eq_ctx = et->eq_ctx;
+
+    const int start = job_idx * et->block_size; /* job job_idx handles points [start, end) */
+    const int end   = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx));
+
+    for (int i = start; i < end; i++) {
+        const double x = eq_ctx->interp_coords[0][i];
+        const double z = eq_ctx->interp_coords[2][i];
+        const int zaxis = x <= EPS; /* on the axis the expressions divided by x below are replaced by derivative-based ones */
+
+        double c1o3 = (1.0 / 3.0);
+
+        double gtu[3][3], g[3][3], gu[3][3];
+        double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], G[3][3][3], dG[3][3][3][3];
+        double A[3][3], Au[3][3];
+        double dA[3][3][3], dAu[3][3][3];
+        double Ric[3][3], Ricm[3][3];
+        double rhs_x, rhs_z;
+
+        const double gtxx = eq_ctx->interp_values[I_GTXX][i];
+        const double gtyy = eq_ctx->interp_values[I_GTYY][i];
+        const double gtzz = eq_ctx->interp_values[I_GTZZ][i];
+        const double gtxy = eq_ctx->interp_values[I_GTXY][i];
+        const double gtxz = eq_ctx->interp_values[I_GTXZ][i];
+        const double gtyz = eq_ctx->interp_values[I_GTYZ][i];
+
+        const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+                                 { gtxy, gtyy, gtyz },
+                                 { gtxz, gtyz, gtzz }};
+
+        const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i];
+        const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i];
+        const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i];
+        const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i];
+
+        const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i];
+        const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i];
+        const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i];
+        const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i];
+
+        const double dgt[3][3][3] = { /* used below as dgt[j][k][l] = ∂_j \tilde{γ}_{kl}; the middle (j = y) block has no interpolated input of its own */
+            {
+                { dx_gt11,     0.0, dx_gt13 },
+                {     0.0, dx_gt22,     0.0 },
+                { dx_gt13,     0.0, dx_gt33 },
+            },
+            {
+                {     0.0, zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 },
+                { zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x },
+                { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 },
+            },
+            {
+                { dz_gt11,     0.0, dz_gt13 },
+                {     0.0, dz_gt22,     0.0 },
+                { dz_gt13,     0.0, dz_gt33 },
+            },
+        };
+
+        const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i];
+        const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i];
+        const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i];
+        const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i];
+
+        const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i];
+        const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i];
+        const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i];
+        const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i];
+
+        const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i];
+        const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i];
+        const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i];
+        const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i];
+
+        const double d2gt[3][3][3][3] = { /* used below as d2gt[j][k][l][m], second derivatives with the same index convention as dgt */
+            {
+                {
+                    { dxx_gt11,      0.0, dxx_gt13 },
+                    { 0.0,      dxx_gt22,      0.0 },
+                    { dxx_gt13,      0.0, dxx_gt33 },
+                },
+                {
+                    {      0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+                        zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+                },
+                {
+                    { dxz_gt11,     0.0, dxz_gt13 },
+                    { 0.0,     dxz_gt22,      0.0 },
+                    { dxz_gt13,     0.0, dxz_gt33 },
+                },
+
+            },
+            {
+                {
+                    { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+                        zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+                },
+                {
+                    { zaxis ? dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0,
+                       zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+                    { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 },
+                    { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x },
+                },
+                {
+                    { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+                    { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+                       zaxis ? dxz_gt13 : dz_gt13 / x },
+                    { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+                },
+
+            },
+            {
+                {
+                    { dxz_gt11,      0.0, dxz_gt13 },
+                    {      0.0, dxz_gt22,      0.0 },
+                    { dxz_gt13,      0.0, dxz_gt33 },
+                },
+                {
+                    { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+                    { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+                       zaxis ? dxz_gt13 : dz_gt13 / x },
+                    { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+                },
+                {
+                    { dzz_gt11,      0.0, dzz_gt13 },
+                    {      0.0, dzz_gt22,      0.0 },
+                    { dzz_gt13,      0.0, dzz_gt33 },
+                },
+
+            },
+        };
+
+        const double Atxx = eq_ctx->interp_values[I_ATXX][i];
+        const double Atyy = eq_ctx->interp_values[I_ATYY][i];
+        const double Atzz = eq_ctx->interp_values[I_ATZZ][i];
+        const double Atxy = eq_ctx->interp_values[I_ATXY][i];
+        const double Atxz = eq_ctx->interp_values[I_ATXZ][i];
+        const double Atyz = eq_ctx->interp_values[I_ATYZ][i];
+
+        const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i];
+        const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i];
+        const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i];
+        const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i];
+
+        const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i];
+        const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i];
+        const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i];
+        const double dz_At13 = eq_ctx->interp_values[I_ATXZ_DZ][i];
+
+        const double dAt[3][3][3] = {
+            {
+                { dx_At11,     0.0, dx_At13 },
+                {     0.0, dx_At22,     0.0 },
+                { dx_At13,     0.0, dx_At33 },
+            },
+            {
+                {     0.0, zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 },
+                { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x },
+                { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 },
+            },
+            {
+                { dz_At11,     0.0, dz_At13 },
+                {     0.0, dz_At22,     0.0 },
+                { dz_At13,     0.0, dz_At33 },
+            },
+        };
+
+        const double phi    = eq_ctx->interp_values[I_PHI][i];
+
+        const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i];
+        const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i];
+
+        const double dphi[3] = { phi_dx, 0.0, phi_dz };
+
+        const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i];
+        const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i];
+        const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i];
+
+        const double d2phi[3][3] = {
+            { phi_dxx, 0.0, phi_dxz },
+            {     0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 },
+            { phi_dxz,    0.0,  phi_dzz },
+        };
+
+        const double At[3][3] = {{ Atxx, Atxy, Atxz },
+                                 { Atxy, Atyy, Atyz },
+                                 { Atxz, Atyz, Atzz }};
+
+        const double alpha     = eq_ctx->interp_values[I_ALPHA][i];
+        const double dx_alpha  = eq_ctx->interp_values[I_ALPHA_DX][i];
+        const double dz_alpha  = eq_ctx->interp_values[I_ALPHA_DZ][i];
+
+        const double dalpha[3] = { dx_alpha, 0.0, dz_alpha };
+
+        const double Xtx  = eq_ctx->interp_values[I_XTX][i]; /* Xtx and Xtz are read here but not referenced again in this function */
+        const double Xtz  = eq_ctx->interp_values[I_XTZ][i];
+
+        const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); /* determinant of gt */
+
+        // \tilde{γ}^{ij}: inverse of gt, cofactors divided by det
+        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
+        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
+        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
+        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
+        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+        gtu[1][0] = gtu[0][1];
+        gtu[2][0] = gtu[0][2];
+        gtu[2][1] = gtu[1][2];
+
+        // γ_{jk} = \tilde{γ}_{jk} / φ², γ^{jk} = φ² \tilde{γ}^{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                gu[j][k] = SQR(phi) * gtu[j][k];
+                g[j][k]  = gt[j][k] / SQR(phi);
+            }
+
+        // ∂_j γ_{kl} and ∂_j A_{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
+                    dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi);
+                }
+
+        // ∂_j γ^{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        for (int n = 0; n < 3; n++)
+                            val += -gu[k][m] * gu[l][n] * dg[j][m][n];
+                    dgu[j][k][l] = val;
+                }
+
+        // ∂_{jk} g_{lm}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++) {
+                        d2g[j][k][l][m] = 6.0 *  gt      [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi))    -
+                                          2.0 *  gt      [l][m] * d2phi[j][k]       / (phi * SQR(phi)) -
+                                          2.0 * dgt   [j][l][m] * dphi[k]           / (phi * SQR(phi)) -
+                                          2.0 * dgt   [k][l][m] * dphi[j]           / (phi * SQR(phi)) +
+                                               d2gt[j][k][l][m]                     / SQR(phi);
+                    }
+
+        // Γ^j_{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]);
+                    G[j][k][l] = val;
+                }
+
+        // ∂_j Γ^k_{lm}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++) {
+                        double val = 0.0;
+                        for (int n = 0; n < 3; n++) {
+                            val += dgu[j][k][n] * (dg    [l][m][n] +  dg   [m][l][n] -  dg   [n][l][m]) +
+                                    gu   [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]);
+                        }
+                        dG[j][k][l][m] = 0.5 * val;
+                    }
+
+        // Ric_{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int m = 0; m < 3; m++)
+                    val += dG[m][m][j][k] - dG[k][m][j][m];
+                for (int m = 0; m < 3; m++)
+                    for (int l = 0; l < 3; l++)
+                        val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l];
+                Ric[j][k] = val;
+            }
+
+        // Ric^j_k
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += gu[j][l] * Ric[l][k];
+                Ricm[j][k] = val;
+            }
+
+        // A_{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                A[j][k] = At[j][k] / SQR(phi);
+            }
+
+        // d_j A^{kl}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                for (int l = 0; l < 3; l++) {
+                    double val = 0.0;
+                    for (int m = 0; m < 3; m++)
+                        for (int n = 0; n < 3; n++)
+                            val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n];
+                    dAu[j][k][l] = val;
+                }
+
+        // A^{jk}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    for (int m = 0; m < 3; m++)
+                        val += gu[j][l] * gu[k][m] * A[l][m];
+                Au[j][k] = val;
+            }
+
+        rhs_x = 0.0; /* rhs^i = 2 (∂_j α A^{ij} + α (∂_j A^{ji} + Γ^i_{jk} A^{kj} + Γ^j_{jk} A^{ik})) for i = x, z, summed in the four loops below */
+        rhs_z = 0.0;
+        for (int j = 0; j < 3; j++) {
+            rhs_x += dalpha[j] * Au[0][j];
+            rhs_z += dalpha[j] * Au[2][j];
+        }
+        for (int j = 0; j < 3; j++) {
+            rhs_x += alpha * dAu[j][0][j];
+            rhs_z += alpha * dAu[j][2][j];
+        }
+        for (int j = 0; j < 3; j++) {
+            double val_x = 0.0;
+            double val_z = 0.0;
+            for (int k = 0; k < 3; k++) {
+                val_x += G[0][j][k] * Au[k][j];
+                val_z += G[2][j][k] * Au[k][j];
+            }
+            rhs_x += val_x * alpha;
+            rhs_z += val_z * alpha;
+        }
+        for (int j = 0; j < 3; j++) {
+            double val_x = 0.0;
+            double val_z = 0.0;
+            for (int k = 0; k < 3; k++) {
+                val_x += G[j][j][k] * Au[0][k];
+                val_z += G[j][j][k] * Au[2][k];
+            }
+            rhs_x += val_x * alpha;
+            rhs_z += val_z * alpha;
+        }
+
+        rhs_x *= 2.0;
+        rhs_z *= 2.0;
+
+        double X[3] = { 0.0 }; /* X^i = γ^{jk} Γ^i_{jk}; only the x and z entries are filled */
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                X[0] += gu[j][k] * G[0][j][k];
+                X[2] += gu[j][k] * G[2][j][k];
+            }
+
+        if (EQUATION == 0) { /* EQUATION is the same macro FUNC() pastes into the function name */
+            /* eq 0 */
+            /* ∂_{xx}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + c1o3 * gu[0][0] + (zaxis ? 0.5 * (gu[1][1] + c1o3 * gu[0][0]) : 0.0);
+            /* ∂_{xx}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0;
+            /* ∂_{zz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2];
+            /* ∂_{zz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gu[0][2];
+
+            /* ∂_{xz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2] + (zaxis ? c1o3 * gu[0][2] : 0.0);
+            /* ∂_{xz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[0][0];
+
+            /* ∂_{x}β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[0][j] * G[0][j][0];
+                    t1 += G[j][j][0];
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[0][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[0][1][1] : (gu[1][1] + c1o3 * gu[0][0]) / x);
+            }
+            /* ∂_{x}β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[0][j] * G[0][j][2];
+                    t1 += G[j][j][2];
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[0][0] * t1;
+            }
+
+            /* ∂_{z}β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[2][j] * G[0][j][0];
+                    t1 += G[j][j][0];
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[0][2] * t1 + (zaxis ? 0.0 : c1o3 * gu[0][2] / x);
+            }
+            /* ∂_{z}β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[2][j] * G[0][j][2];
+                    t1 += G[j][j][2];
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[0][2] * t1;
+            }
+
+            /* β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int k = 0; k < 3; k++) {
+                    for (int l = 0; l < 3; l++) {
+                        double val = 0.0;
+                        for (int j = 0; j < 3; j++)
+                            val += G[0][k][j] * G[j][l][0] - G[j][k][l] * G[0][0][j];
+                        t0 += gu[k][l] * (dG[k][0][l][0] + val);
+                        t1 += gu[0][k] * dG[k][l][l][0];
+                    }
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][0] + (zaxis ? 0.0 : 2.0 * gu[1][1] * G[0][1][1] / x - (gu[1][1] + c1o3 * gu[0][0]) / SQR(x));
+            }
+
+            /* β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int k = 0; k < 3; k++) {
+                    for (int l = 0; l < 3; l++) {
+                        double val = 0.0;
+                        for (int j = 0; j < 3; j++)
+                            val += G[0][k][j] * G[j][l][2] - G[j][k][l] * G[0][2][j];
+                        t0 += gu[k][l] * (dG[k][0][l][2] + val);
+                        t1 += gu[0][k] * dG[k][l][l][2];
+                    }
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][2];
+            }
+
+            eq_ctx->rhs[i]                                 = rhs_x;
+        } else {
+            /* eq 1 */
+            /* ∂_{xx}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gu[2][0] + (zaxis ? c1o3 * 0.5 * gu[2][0] : 0.0);
+            /* ∂_{xx}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + (zaxis ? gu[1][1] : 0.0);
+            /* ∂_{zz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0;
+            /* ∂_{zz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2] + c1o3 * gu[2][2];
+            /* ∂_{xz}β^x */
+            eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[2][2] + (zaxis ? c1o3 * gu[2][2] : 0.0);
+            /* ∂_{xz}β^z */
+            eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2];
+
+            /* ∂_{x}β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[0][j] * G[2][j][0];
+                    t1 += G[j][j][0];
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[2][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[2][1][1] : c1o3 * gu[2][0] / x);
+            }
+            /* ∂_{x}β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[0][j] * G[2][j][2];
+                    t1 += G[j][j][2];
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[2][0] * t1 + (zaxis ? 0.0 : gu[1][1] / x);
+            }
+            /* ∂_{z}β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[2][j] * G[2][j][0];
+                    t1 += G[j][j][0];
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[2][2] * t1 + (zaxis ? 0.0 : c1o3 * gu[2][2] / x);
+            }
+            /* ∂_{z}β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int j = 0; j < 3; j++) {
+                    t0 += gu[2][j] * G[2][j][2];
+                    t1 += G[j][j][2];
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[2][2] * t1;
+            }
+
+            /* β^x */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int k = 0; k < 3; k++) {
+                    for (int l = 0; l < 3; l++) {
+                        double val = 0.0;
+                        for (int j = 0; j < 3; j++)
+                            val += G[2][k][j] * G[j][l][0] - G[j][k][l] * G[2][0][j];
+                        t0 += gu[k][l] * (dG[k][2][l][0] + val);
+                        t1 += gu[2][k] * dG[k][l][l][0];
+                    }
+                }
+                eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][0] + (zaxis ? 0.0 : 2.0 * gu[1][1] * G[2][1][1] / x - c1o3 * gu[2][0] / SQR(x));
+            }
+
+            /* β^z */
+            {
+                double t0 = 0.0;
+                double t1 = 0.0;
+                for (int k = 0; k < 3; k++) {
+                    for (int l = 0; l < 3; l++) {
+                        double val = 0.0;
+                        for (int j = 0; j < 3; j++)
+                            val += G[2][k][j] * G[j][l][2] - G[j][k][l] * G[2][2][j];
+                        t0 += gu[k][l] * (dG[k][2][l][2] + val);
+                        t1 += gu[2][k] * dG[k][l][l][2];
+                    }
+                }
+                eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][2];
+            }
+
+            eq_ctx->rhs[i] = rhs_z;
+        }
+    }
+}
diff --git a/src/pssolve.c b/src/pssolve.c
new file mode 100644
index 0000000..1f5bb44
--- /dev/null
+++ b/src/pssolve.c
@@ -0,0 +1,498 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include "bicgstab.h"
+#include "pssolve.h"
+#include "threadpool.h"
+
+/* Total number of spectral coefficients of one equation context, i.e. the
+ * product of the per-direction counts. */
+#define NB_COEFFS(eq_ctx)        ((eq_ctx)->nb_coeffs[0]        * (eq_ctx)->nb_coeffs[1])
+/* Total number of collocation points of one equation context, i.e. the
+ * product of the per-direction counts. */
+#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1])
+
+/**
+ * Solver state kept separately for each equation.
+ */
+typedef struct PSEquationContext {
+    /* number of basis functions in each direction, copied from
+     * PSSolveContext.solve_order by md_pssolve_context_init() */
+    size_t nb_coeffs[2];
+    /* number of collocation points in each direction; set to the same values
+     * as nb_coeffs by md_pssolve_context_init() */
+    size_t nb_colloc_points[2];
+    /* order passed to md_basis_colloc_point() when the grid is computed */
+    size_t colloc_grid_order[2];
+
+    /* basis_val[var][diff_order] holds the products of the 1D basis functions
+     * (or their derivatives) of variable var, evaluated at this equation's
+     * collocation points; the entry for grid point g and coefficient c lives
+     * at index g + NB_COLLOC_POINTS(this) * c */
+    double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB];
+    /* first element of this equation's rows inside the shared column-major
+     * PSSolvePriv.mat; points into that buffer and is never freed on its own */
+    double *mat;
+} PSEquationContext;
+
+/**
+ * Private solver state hidden behind PSSolveContext.priv.
+ */
+struct PSSolvePriv {
+    /* iterative solver; tried first unless 1024 solves have been done since
+     * the last LU inversion */
+    BiCGStabContext *bicgstab;
+    /* number of successful BiCGStab solves since the last LU inversion;
+     * initialized to INT_MAX so that the very first solve goes through LU */
+    int steps_since_inverse;
+
+    /* total number of coefficients over all equations, i.e. the dimension of
+     * the linear system */
+    size_t nb_coeffs;
+
+    /* per-equation state, PSSolveContext.nb_equations elements */
+    PSEquationContext *eqs;
+
+    /* pivot indices for the LU decomposition, nb_coeffs elements */
+    int *ipiv;
+    /* the nb_coeffs x nb_coeffs pseudospectral matrix, stored column-major */
+    double *mat;
+
+    /* the thread pool actually used: either the caller-supplied one or
+     * tp_internal */
+    ThreadPoolContext *tp;
+    /* thread pool created by the solver when the caller supplied none; freed
+     * in md_pssolve_context_free() */
+    ThreadPoolContext *tp_internal;
+};
+
+/**
+ * Arguments shared by all construct_matrix() jobs that fill the sub-block of
+ * the matrix belonging to one (equation, variable) pair.
+ */
+typedef struct ConstructMatrixThread {
+    /* the equation whose rows are being filled */
+    const PSEquationContext *eq_ctx;
+    /* eq_coeffs[diff_order] is the array of equation coefficients at the
+     * collocation points, PSSOLVE_DIFF_ORDER_NB arrays in total */
+    const double **eq_coeffs;
+    /* top-left element of the sub-block to be written */
+    double *mat;
+    /* distance (in elements) between consecutive columns of mat */
+    ptrdiff_t mat_stride;
+    /* index of the variable whose basis functions are used */
+    unsigned int var_idx;
+} ConstructMatrixThread;
+
+/**
+ * Thread pool job that fills a single column of one matrix sub-block.
+ *
+ * The job index selects the spectral coefficient (= column). For every
+ * collocation point of the equation the matrix element is the sum over all
+ * differential orders of (equation coefficient) * (basis function value).
+ *
+ * @param arg     pointer to the ConstructMatrixThread describing the sub-block
+ * @param job_idx index of the coefficient/column handled by this job
+ * The remaining parameters are required by the thread pool callback signature
+ * and are not used.
+ */
+static void construct_matrix(void *arg,
+                             unsigned int job_idx,    unsigned int nb_jobs,
+                             unsigned int thread_idx, unsigned int nb_threads)
+{
+    const ConstructMatrixThread *task = arg;
+    const PSEquationContext       *eq = task->eq_ctx;
+    const double             **coeffs = task->eq_coeffs;
+
+    const size_t nb_points    = NB_COLLOC_POINTS(eq);
+    /* offset of this coefficient's values inside the basis_val arrays */
+    const size_t basis_offset = nb_points * job_idx;
+    /* start of the destination column */
+    double *column = task->mat + task->mat_stride * job_idx;
+
+    for (size_t point = 0; point < nb_points; point++) {
+        double sum = 0.0;
+        int  order = 0;
+
+        while (order < PSSOLVE_DIFF_ORDER_NB) {
+            sum += coeffs[order][point] *
+                   eq->basis_val[task->var_idx][order][basis_offset + point];
+            order++;
+        }
+
+        column[point] = sum;
+    }
+}
+
+/**
+ * Solve mat * x = rhs through LU decomposition and compute the inverse of mat.
+ *
+ * @param N    dimension of the square, column-major matrix
+ * @param mat  on entry the matrix; on success overwritten with its inverse
+ * @param rhs  on entry the right-hand side; on success overwritten with the
+ *             solution
+ * @param ipiv storage for N pivot indices
+ *
+ * @return 0 on success, -ENOMEM if the temporary buffers cannot be allocated,
+ *         -EDOM if LAPACK reports an error or an exactly singular matrix. On
+ *         failure mat and rhs are not overwritten.
+ */
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    char  equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *mat_f = NULL, *x = NULL;
+    int info;
+    int ret = 0;
+
+#if 0
+    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+                  mat, N, ipiv, rhs, N);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+    /* do the size arithmetic in size_t, N * N may not fit into an int */
+    mat_f = malloc((size_t)N * N * sizeof(*mat_f));
+    x     = malloc((size_t)N * sizeof(*x));
+    if (!mat_f || !x) {
+        ret = -ENOMEM;
+        goto finish;
+    }
+
+    //{
+    //    int i, j;
+    //    for (i = 0; i < N; i++) {
+    //        for (j = 0; j < N; j++)
+    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
+    //        fprintf(stderr, "\n");
+    //    }
+    //}
+    //{
+    //    double *mat_copy = malloc(SQR(N) * sizeof(double));
+    //    double *svd = malloc(N * sizeof(double));
+    //    double *rhs_copy = malloc(N * sizeof(double));
+    //    int rank;
+
+    //    memcpy(mat_copy, mat, SQR(N) * sizeof(double));
+    //    memcpy(rhs_copy, rhs, N * sizeof(double));
+
+    //    LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
+    //                   svd, 1e-13, &rank);
+
+    //    free(mat_copy);
+    //    for (int i = 0; i < N; i++) {
+    //        if (i > 5 && i < N - 5)
+    //            continue;
+
+    //        fprintf(stderr, "%g\t", svd[i]);
+    //    }
+    //    fprintf(stderr, "\n rank %d\n", rank);
+    //    free(svd);
+    //    free(rhs_copy);
+
+    //    if (rank < N)
+    //        ret = 1;
+    //}
+
+    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+    //              mat, N, ipiv, rhs, N);
+
+    /* factorize into mat_f and solve into x; with fact = 'N' the inputs
+     * mat and rhs are left unmodified by this call */
+    info = LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                          mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                          rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    /* info in [1, N] means an exactly singular matrix, for which no solution
+     * is computed; info == N + 1 only warns about a badly conditioned matrix
+     * and still yields a solution, so it is not treated as an error */
+    if (info < 0 || (info > 0 && info <= N)) {
+        fprintf(stderr, "LU solve of a %dx%d matrix failed: dgesvx returned %d\n",
+                N, N, info);
+        ret = -EDOM;
+        goto finish;
+    }
+
+    /* turn the LU factors into the inverse */
+    info = LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    if (info) {
+        fprintf(stderr, "Inverting a %dx%d matrix failed: dgetri returned %d\n",
+                N, N, info);
+        ret = -EDOM;
+        goto finish;
+    }
+
+    memcpy(rhs, x, (size_t)N * sizeof(*rhs));
+    memcpy(mat, mat_f, (size_t)N * N * sizeof(*mat));
+
+    /* N is an int, so it must be printed with %d; note that the value LAPACK
+     * stores in cond is the reciprocal condition number estimate */
+    fprintf(stderr, "LU factorization solution to a %dx%d matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+finish:
+    free(mat_f);
+    free(x);
+#endif
+
+    return ret;
+}
+
+/**
+ * Assemble the pseudospectral matrix from the equation coefficients and solve
+ * the resulting linear system for the spectral coefficients.
+ *
+ * BiCGStab is tried first as long as fewer than 1024 solves have been done
+ * since the last LU inversion. When it is skipped or fails, the system is
+ * solved through LU decomposition and the result is handed to
+ * md_bicgstab_init() for the following BiCGStab solves.
+ *
+ * @param ctx       the solver context, initialized with
+ *                  md_pssolve_context_init()
+ * @param eq_coeffs eq_coeffs[i][j][k] is the array of the coefficients
+ *                  multiplying the k-th differential order of variable j in
+ *                  equation i, evaluated at the collocation points of
+ *                  equation i
+ * @param rhs       right-hand side of the system, one value per row of the
+ *                  matrix
+ * @param coeffs    the solution is written here
+ *
+ * @return a negative value on failure; otherwise the value returned by
+ *         md_bicgstab_solve() or md_bicgstab_init(), whichever ran last
+ */
+int md_pssolve_solve(PSSolveContext *ctx,
+                     const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
+                     const double *rhs, double *coeffs)
+{
+    PSSolvePriv *s = ctx->priv;
+    double rhs_max;
+    int64_t start;
+
+    int ret = 0;
+
+    /* fill the matrix */
+    start = gettime();
+
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        PSEquationContext *eq_ctx = &s->eqs[i];
+        double *mat = s->eqs[i].mat;
+
+        for (int j = 0; j < ctx->nb_equations; j++) {
+            ConstructMatrixThread thread = {
+                .eq_ctx     = eq_ctx,
+                .eq_coeffs  = eq_coeffs[i][j],
+                .mat        = mat,
+                .mat_stride = s->nb_coeffs,
+                .var_idx    = j,
+            };
+            /* one job per coefficient (= matrix column) of variable j */
+            md_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix,
+                                  &thread);
+            /* skip to the columns belonging to the next variable */
+            mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs;
+        }
+    }
+
+    ctx->construct_matrix_time += gettime() - start;
+    ctx->construct_matrix_count++;
+
+#if 0
+    if (rhs_max < EPS) {
+        fprintf(stderr, "zero rhs\n");
+        memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+        if (ms->cl_queue) {
+            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+                                 ms->coeffs, 0, NULL, NULL);
+        }
+        return 0;
+    }
+#endif
+
+    /* solve for the coeffs */
+    if (s->steps_since_inverse < 1024) {
+        start = gettime();
+
+        CCTK_TimerStart("MinimalDistortion_solve_BiCGSTAB");
+        ret = md_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
+        CCTK_TimerStop("MinimalDistortion_solve_BiCGSTAB");
+
+        if (ret >= 0) {
+            ctx->cg_time_total += gettime() - start;
+            ctx->cg_solve_count++;
+            ctx->cg_iter_count += ret + 1;
+            s->steps_since_inverse++;
+
+        }
+    } else
+        ret = -1;
+
+    /* BiCGStab was skipped or failed, fall back to a direct solve */
+    if (ret < 0) {
+        CCTK_TimerStart("MinimalDistortion_solve_LU");
+        start = gettime();
+
+        /* lu_invert() solves in place */
+        memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs));
+
+        ret = lu_invert(s->nb_coeffs, s->mat, coeffs, s->ipiv);
+        ctx->lu_solves_time += gettime() - start;
+        ctx->lu_solves_count++;
+        CCTK_TimerStop("MinimalDistortion_solve_LU");
+
+        /* report a failed inversion instead of letting the BiCGStab
+         * initialization below overwrite the error code */
+        if (ret < 0)
+            return ret;
+
+        ret = md_bicgstab_init(s->bicgstab, s->mat, coeffs);
+
+        s->steps_since_inverse = 0;
+    }
+
+    return ret;
+}
+
+/**
+ * Precompute the values of the 2D basis functions (and their derivatives) of
+ * every variable at the collocation points of equation eq_idx.
+ *
+ * The results are stored in s->eqs[eq_idx].basis_val[var][diff_order]; the
+ * value for grid point g and coefficient c is at index
+ * g + NB_COLLOC_POINTS(equation) * c.
+ *
+ * @return 0 on success, -ENOMEM on allocation failure. Buffers that were
+ *         already attached to the equation context are not released here.
+ */
+static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx)
+{
+    PSSolvePriv            *s = ctx->priv;
+    PSEquationContext *eq_ctx = &s->eqs[eq_idx];
+    int ret = 0;
+
+    eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val));
+    if (!eq_ctx->basis_val)
+        return -ENOMEM;
+
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        /* temporary 1D tables: basis_val[dir][diff_order][point * nb_coeffs + coeff] */
+        double *basis_val[2][3] = { { NULL } };
+
+        /* for each direction, compute the corresponding basis values/derivatives */
+        for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) {
+            for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) {
+                ret = posix_memalign((void**)&basis_val[dir][diff_order], 32,
+                                     sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]);
+                if (ret) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
+            }
+
+            for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) {
+                double coord = ctx->colloc_grid[eq_idx][dir][k];
+                for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) {
+                    basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_VALUE, coord, l);
+                    basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF1, coord, l);
+                    basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF2, coord, l);
+                }
+            }
+        }
+
+        /* The tables are indexed by this equation's grid points and by the
+         * coefficients of variable i, so they must be sized with the
+         * coefficient count of variable i, not of the equation itself;
+         * otherwise they overflow when the two orders differ. */
+        for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) {
+            ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32,
+                                 NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(&s->eqs[i]) * sizeof(*eq_ctx->basis_val[i][diff]));
+            if (ret) {
+                ret = -ENOMEM;
+                goto fail;
+            }
+        }
+
+        /* combine the 1D tables into the 2D products */
+        for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) {
+            const double   *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1];
+            const double  *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1];
+            const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1];
+
+            for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) {
+                const double   *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0];
+                const double  *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0];
+                const double *d2basis0 = basis_val[0][2] + k * s->eqs[i].nb_coeffs[0];
+
+                const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k;
+
+                for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++)
+                    for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) {
+                        const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m;
+                        const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff;
+
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] =   basis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] =  dbasis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] =   basis0[m] *  dbasis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] =   basis0[m] * d2basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] =  dbasis0[m] *  dbasis1[l];
+                    }
+            }
+        }
+
+        /* reached on success as well: the 1D tables are always temporary */
+fail:
+        for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++)
+            for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++)
+                free(basis_val[dir][diff]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/**
+ * Finish setting up a solver context after the caller has filled in the
+ * basis sets and solver orders.
+ *
+ * Selects the thread pool, allocates the matrix and pivot storage, computes
+ * the collocation grids, precomputes the basis function values and allocates
+ * the BiCGStab solver.
+ *
+ * @return 0 on success, -EINVAL if a basis set or solver order is missing,
+ *         -ENOMEM on allocation failure, or the negative error code of a
+ *         failing helper. Nothing is released on failure; the caller is
+ *         expected to call md_pssolve_context_free().
+ */
+int md_pssolve_context_init(PSSolveContext *ctx)
+{
+    PSSolvePriv *s = ctx->priv;
+    size_t N = 0;
+
+    int ret = 0;
+
+    if (ctx->tp) {
+        s->tp = ctx->tp;
+    } else {
+        /* no pool supplied by the caller, run single-threaded */
+        ret = md_threadpool_init(&s->tp_internal, 1);
+        if (ret < 0)
+            return ret;
+        s->tp = s->tp_internal;
+    }
+
+    /* sanity check the parameters */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        if (!ctx->basis[i][0] || !ctx->basis[i][1]) {
+            fprintf(stderr, "Basis set for variable %d not set\n", i);
+            return -EINVAL;
+        }
+        if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) {
+            fprintf(stderr, "Solver order for variable %d not set\n", i);
+            return -EINVAL;
+        }
+
+        N += ctx->solve_order[i][0] * ctx->solve_order[i][1];
+    }
+
+    /* posix_memalign() reports failure through its return value; the output
+     * pointers are reset explicitly so that the cleanup code never sees
+     * anything but NULL or a valid allocation */
+    if (posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N)) {
+        s->ipiv = NULL;
+        return -ENOMEM;
+    }
+    if (posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N)) {
+        s->mat = NULL;
+        return -ENOMEM;
+    }
+
+    s->nb_coeffs = N;
+
+    ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid));
+    if (!ctx->colloc_grid)
+        return -ENOMEM;
+
+    /* initialize the per-equation state */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        PSEquationContext *eq_ctx = &s->eqs[i];
+
+        eq_ctx->nb_coeffs[0]         = ctx->solve_order[i][0];
+        eq_ctx->nb_coeffs[1]         = ctx->solve_order[i][1];
+        eq_ctx->nb_colloc_points[0]  = ctx->solve_order[i][0];
+        eq_ctx->nb_colloc_points[1]  = ctx->solve_order[i][1];
+        eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0];
+        eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1];
+
+        /* the rows of each equation follow those of the previous one */
+        if (i == 0)
+            eq_ctx->mat = s->mat;
+        else
+            eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]);
+
+        /* compute the collocation grid */
+        for (int dir = 0; dir < 2; dir++) {
+            ret = posix_memalign((void**)&ctx->colloc_grid[i][dir], 32,
+                                 eq_ctx->nb_colloc_points[dir] * sizeof(*ctx->colloc_grid[i][dir]));
+            if (ret) {
+                ctx->colloc_grid[i][dir] = NULL;
+                return -ENOMEM;
+            }
+        }
+
+        for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++)
+            ctx->colloc_grid[i][0][j] = md_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j);
+        for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++)
+            ctx->colloc_grid[i][1][j] = md_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j);
+
+    }
+
+    /* precompute the basis values we will need */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        ret = basis_val_init(ctx, i);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* force an LU inversion on the first solve */
+    s->steps_since_inverse = INT_MAX;
+
+    /* init the BiCGStab solver */
+    ret = md_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+/**
+ * Allocate a zero-initialized solver context for nb_equations equations,
+ * together with its private data and the per-equation arrays the caller has
+ * to fill in (basis, solve_order).
+ *
+ * @param pctx         the new context is stored here on success
+ * @param nb_equations number of equations, must not be zero
+ *
+ * @return 0 on success, -EINVAL for zero equations, -ENOMEM when any of the
+ *         allocations fails
+ */
+int md_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations)
+{
+    PSSolveContext *new_ctx;
+    int allocated;
+
+    if (nb_equations == 0)
+        return -EINVAL;
+
+    new_ctx = calloc(1, sizeof(*new_ctx));
+    if (new_ctx == NULL)
+        return -ENOMEM;
+
+    new_ctx->nb_equations = nb_equations;
+
+    /* && short-circuits, so the allocations run in order and stop at the
+     * first one that fails */
+    allocated = (new_ctx->priv        = calloc(1,            sizeof(*new_ctx->priv)))        &&
+                (new_ctx->priv->eqs   = calloc(nb_equations, sizeof(*new_ctx->priv->eqs)))   &&
+                (new_ctx->basis       = calloc(nb_equations, sizeof(*new_ctx->basis)))       &&
+                (new_ctx->solve_order = calloc(nb_equations, sizeof(*new_ctx->solve_order)));
+    if (!allocated) {
+        md_pssolve_context_free(&new_ctx);
+        return -ENOMEM;
+    }
+
+    *pctx = new_ctx;
+    return 0;
+}
+
+/**
+ * Free a solver context and everything it owns, then write NULL to *pctx.
+ *
+ * Safe to call on a NULL context and on a context that was only partially
+ * set up, i.e. after a failure inside md_pssolve_context_alloc() or
+ * md_pssolve_context_init(), or when md_pssolve_context_init() was never
+ * called.
+ */
+void md_pssolve_context_free(PSSolveContext **pctx)
+{
+    PSSolveContext *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        if (ctx->priv->eqs) {
+            for (int i = 0; i < ctx->nb_equations; i++) {
+                PSEquationContext *eq_ctx = &ctx->priv->eqs[i];
+
+                /* basis_val is only allocated during init, it is still NULL
+                 * when the context is freed before that */
+                if (!eq_ctx->basis_val)
+                    continue;
+
+                for (int j = 0; j < ctx->nb_equations; j++)
+                    for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++)
+                        free(eq_ctx->basis_val[j][k]);
+                free(eq_ctx->basis_val);
+            }
+        }
+
+        free(ctx->priv->eqs);
+
+        free(ctx->priv->ipiv);
+        free(ctx->priv->mat);
+
+        md_bicgstab_context_free(&ctx->priv->bicgstab);
+        md_threadpool_free(&ctx->priv->tp_internal);
+    }
+
+    free(ctx->priv);
+
+    if (ctx->colloc_grid) {
+        for (int i = 0; i < ctx->nb_equations; i++)
+            for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++)
+                free(ctx->colloc_grid[i][j]);
+    }
+
+    /* colloc_grid is one allocation holding nb_equations pairs of pointers;
+     * it has to be released exactly once through its base address (freeing
+     * colloc_grid[1] would pass an interior pointer to free()) */
+    free(ctx->colloc_grid);
+
+    free(ctx->basis);
+    free(ctx->solve_order);
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/pssolve.h b/src/pssolve.h
new file mode 100644
index 0000000..e6a4c1a
--- /dev/null
+++ b/src/pssolve.h
@@ -0,0 +1,139 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_PSSOLVE_H
+#define MD_PSSOLVE_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+#include <stdint.h>
+
+#include "basis.h"
+#include "threadpool.h"
+
+/**
+ * Indices of the terms of a second order 2D differential operator.
+ *
+ * PSSOLVE_DIFF_ORDER_ab denotes the term differentiated a times in
+ * direction 0 and b times in direction 1.
+ */
+enum PSSolveDiffOrder {
+    PSSOLVE_DIFF_ORDER_00, /* the function itself */
+    PSSOLVE_DIFF_ORDER_10, /* first derivative in direction 0 */
+    PSSOLVE_DIFF_ORDER_01, /* first derivative in direction 1 */
+    PSSOLVE_DIFF_ORDER_11, /* mixed second derivative */
+    PSSOLVE_DIFF_ORDER_20, /* second derivative in direction 0 */
+    PSSOLVE_DIFF_ORDER_02, /* second derivative in direction 1 */
+    PSSOLVE_DIFF_ORDER_NB, /* number of the terms, not a valid index */
+};
+
+typedef struct PSSolvePriv PSSolvePriv;
+
+/**
+ * Public solver context. Allocated with md_pssolve_context_alloc(), freed
+ * with md_pssolve_context_free().
+ */
+typedef struct PSSolveContext {
+    /**
+     * Solver private data, not to be touched by the caller.
+     */
+    PSSolvePriv *priv;
+
+    /**
+     * Number of equations/unknown functions in the set.
+     * Set by md_pssolve_context_alloc().
+     */
+    unsigned int nb_equations;
+
+    /**
+     * The basis sets.
+     *
+     * basis[i][j] is the basis set used for i-th variable in j-th direction.
+     *
+     * The array is allocated by md_pssolve_context_alloc(), must be filled
+     * by the caller before md_pssolve_context_init().
+     */
+    const MDBasisSetContext *(*basis)[2];
+
+    /**
+     * Order of the solver.
+     *
+     * solve_order[i][j] is the order of the solver (i.e. the number of the
+     * basis functions used) for i-th variable in j-th direction.
+     *
+     * Allocated by md_pssolve_context_alloc(), must be filled by the caller
+     * before md_pssolve_context_init().
+     */
+    unsigned int (*solve_order)[2];
+
+    /**
+     * Locations of the collocation points. The equation coefficients passed to
+     * md_pssolve_solve() should be evaluated at those grid positions.
+     *
+     * colloc_grid[i][j] is an array of length solve_order[i][j] and contains
+     * the collocation points for the i-th variable in the j-th direction.
+     *
+     * Set by the solver after md_pssolve_context_init().
+     */
+    double *(*colloc_grid)[2];
+
+    /**
+     * The thread pool used for multithreaded execution. May be set by the
+     * caller before md_pssolve_context_init(), otherwise a single thread will
+     * be used.
+     */
+    ThreadPoolContext *tp;
+
+    /**
+     * OpenCL context and command queue. They are passed on to the BiCGStab
+     * solver by md_pssolve_context_init().
+     */
+    cl_context       ocl_ctx;
+    cl_command_queue ocl_queue;
+
+    /**
+     * Statistics, updated by md_pssolve_solve(). The *_time fields accumulate
+     * differences of gettime() values.
+     */
+    /* number of solves done through LU decomposition and the time they took */
+    uint64_t lu_solves_count;
+    uint64_t lu_solves_time;
+
+    /* number of successful BiCGStab solves, the accumulated iteration count
+     * they reported and the time they took */
+    uint64_t cg_solve_count;
+    uint64_t cg_iter_count;
+    uint64_t cg_time_total;
+
+    /* number of times the matrix was assembled and the time that took */
+    uint64_t construct_matrix_count;
+    uint64_t construct_matrix_time;
+} PSSolveContext;
+
+/**
+ * Allocate a new solver.
+ *
+ * @param ctx the newly allocated solver is written here on success
+ * @param nb_equations number of equations in the set, must not be zero
+ *
+ * @return 0 on success, -EINVAL when nb_equations is zero, -ENOMEM on
+ *         allocation failure
+ */
+int md_pssolve_context_alloc(PSSolveContext **ctx, unsigned int nb_equations);
+
+/**
+ * Initialize the solver for use after all the context options have been set.
+ *
+ * @return 0 on success, a negative error code on failure
+ */
+int md_pssolve_context_init(PSSolveContext *ctx);
+
+/**
+ * Free the solver and all its internal state.
+ *
+ * NULL is written into *ctx. Does nothing when *ctx is already NULL.
+ */
+void md_pssolve_context_free(PSSolveContext **ctx);
+
+/**
+ * Solve a second order linear PDE in 2D with a pseudospectral method.
+ *
+ * @param ctx the solver, initialized with md_pssolve_context_init().
+ * @param eq_coeffs the equation coefficients. eq_coeffs[i][j][k] is the array
+ *                  of the coefficients multiplying the term of differential
+ *                  order k (one of enum PSSolveDiffOrder) of the j-th variable
+ *                  in the i-th equation, at the collocation points.
+ * @param rhs the right-hand side of the equation at the collocation points.
+ * @param coeffs the spectral coefficients of the solution will be written here.
+ *
+ * @return a negative value on failure, a non-negative value otherwise
+ *         (NOTE(review): the exact non-negative value comes from the BiCGStab
+ *         helpers -- confirm against bicgstab.h before relying on it)
+ */
+int md_pssolve_solve(PSSolveContext *ctx,
+                     const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
+                     const double *rhs, double *coeffs);
+
+#endif /* MD_PSSOLVE_H */
diff --git a/src/register.c b/src/register.c
new file mode 100644
index 0000000..64b47ce
--- /dev/null
+++ b/src/register.c
@@ -0,0 +1,7 @@
+/* Pass the three ML_BSSN shift components to MoLRegisterConstrained(). */
+void minimal_distortion_axi_register_mol(CCTK_ARGUMENTS)
+{
+    static const char * const shift_vars[] = {
+        "ML_BSSN::beta1",
+        "ML_BSSN::beta2",
+        "ML_BSSN::beta3",
+    };
+
+    for (unsigned int i = 0; i < sizeof(shift_vars) / sizeof(*shift_vars); i++)
+        MoLRegisterConstrained(CCTK_VarIndex(shift_vars[i]));
+}
+
diff --git a/src/threadpool.c b/src/threadpool.c
new file mode 100644
index 0000000..2febdcb
--- /dev/null
+++ b/src/threadpool.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "threadpool.h"
+
+/**
+ * State of a single worker thread.
+ */
+typedef struct WorkerContext {
+    /* the pool this worker belongs to */
+    ThreadPoolContext *parent;
+    /* handle of the worker thread, joined in md_threadpool_free() */
+    pthread_t       thread;
+    /* index of the worker, passed to the job callback as thread_idx */
+    unsigned int idx;
+} WorkerContext;
+
+/**
+ * The thread pool. All the job bookkeeping fields below are accessed with
+ * mutex held.
+ */
+struct ThreadPoolContext {
+    /* the worker threads; nb_workers counts those actually started */
+    WorkerContext  *workers;
+    unsigned int nb_workers;
+
+    /* cond is signalled both when new jobs are posted and when a job is done */
+    pthread_mutex_t mutex;
+    pthread_cond_t  cond;
+    /* callback and argument of the batch currently being executed */
+    void (*func)(void *arg,
+                 unsigned int job_idx,    unsigned int nb_jobs,
+                 unsigned int thread_idx, unsigned int nb_threads);
+    void *func_arg;
+    /* index of the next job to be handed to a worker */
+    int next_job;
+    /* number of jobs in the current batch */
+    int nb_jobs;
+    /* number of jobs of the current batch that have completed */
+    int nb_jobs_finished;
+
+    /* set by md_threadpool_free() to make the workers exit */
+    int finish;
+};
+
+/**
+ * Main function of a worker thread.
+ *
+ * Sleeps until either a job is available or the pool is being destroyed.
+ * Jobs are claimed one at a time under the pool mutex and executed with the
+ * mutex released; after each job the finished counter is incremented and the
+ * condition variable is broadcast so that md_threadpool_execute() can notice
+ * completion.
+ *
+ * The function is only referenced from this file, hence it is static.
+ *
+ * @param arg the WorkerContext of this thread
+ * @return always NULL
+ */
+static void *worker_thread(void *arg)
+{
+    WorkerContext *w = arg;
+    ThreadPoolContext *ctx = w->parent;
+    int nb_jobs, job_idx;
+
+    while (1) {
+        pthread_mutex_lock(&ctx->mutex);
+        /* wait for work or for the request to quit */
+        while (!ctx->finish && ctx->next_job >= ctx->nb_jobs)
+            pthread_cond_wait(&ctx->cond, &ctx->mutex);
+
+        if (ctx->finish) {
+            pthread_mutex_unlock(&ctx->mutex);
+            break;
+        }
+
+        /* claim the next job */
+        nb_jobs = ctx->nb_jobs;
+        job_idx = ctx->next_job++;
+
+        pthread_mutex_unlock(&ctx->mutex);
+
+        ctx->func(ctx->func_arg, job_idx, nb_jobs, w->idx, ctx->nb_workers);
+
+        pthread_mutex_lock(&ctx->mutex);
+
+        ctx->nb_jobs_finished++;
+
+        /* wake up the thread waiting in md_threadpool_execute() */
+        pthread_cond_broadcast(&ctx->cond);
+        pthread_mutex_unlock(&ctx->mutex);
+    }
+    return NULL;
+}
+
+/**
+ * Create a thread pool with nb_threads worker threads.
+ *
+ * @param pctx       the new pool is written here on success
+ * @param nb_threads number of worker threads to start, must not be zero
+ *
+ * @return 0 on success, -ENOSYS when nb_threads is zero, -ENOMEM on
+ *         allocation failure, or the negated error code of the failing
+ *         pthread call
+ */
+int md_threadpool_init(ThreadPoolContext **pctx, unsigned int nb_threads)
+{
+    ThreadPoolContext *ctx;
+    int ret = 0;
+
+    if (!nb_threads)
+        return -ENOSYS;
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return -ENOMEM;
+
+    /* md_threadpool_free() locks the mutex, so it may only be used for
+     * cleanup once both synchronization objects exist */
+    ret = pthread_mutex_init(&ctx->mutex, NULL);
+    if (ret) {
+        free(ctx);
+        return -ret;
+    }
+
+    ret = pthread_cond_init(&ctx->cond, NULL);
+    if (ret) {
+        pthread_mutex_destroy(&ctx->mutex);
+        free(ctx);
+        return -ret;
+    }
+
+    ctx->workers = calloc(nb_threads, sizeof(*ctx->workers));
+    if (!ctx->workers) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    for (unsigned int i = 0; i < nb_threads; i++) {
+        WorkerContext *w = &ctx->workers[i];
+
+        w->idx        = i;
+        w->parent     = ctx;
+
+        ret = pthread_create(&w->thread, NULL, worker_thread, w);
+        if (ret) {
+            ret = -ret;
+            goto fail;
+        }
+
+        /* only count the threads that really run, they are the ones
+         * md_threadpool_free() has to join */
+        ctx->nb_workers++;
+    }
+
+
+    *pctx = ctx;
+    return 0;
+fail:
+    md_threadpool_free(&ctx);
+    return ret;
+}
+
+/**
+ * Stop all the worker threads, destroy the pool and write NULL to *pctx.
+ * Does nothing when *pctx is already NULL.
+ */
+void md_threadpool_free(ThreadPoolContext **pctx)
+{
+    ThreadPoolContext *pool = *pctx;
+    unsigned int joined = 0;
+
+    if (pool == NULL)
+        return;
+
+    /* ask the workers to quit and wake up the idle ones */
+    pthread_mutex_lock(&pool->mutex);
+    pool->finish = 1;
+    pthread_cond_broadcast(&pool->cond);
+    pthread_mutex_unlock(&pool->mutex);
+
+    /* wait for every started worker to exit */
+    while (joined < pool->nb_workers) {
+        pthread_join(pool->workers[joined].thread, NULL);
+        joined++;
+    }
+
+    pthread_mutex_destroy(&pool->mutex);
+    pthread_cond_destroy(&pool->cond);
+
+    free(pool->workers);
+    free(pool);
+
+    *pctx = NULL;
+}
+
+/**
+ * Run func nb_jobs times on the worker threads and wait until all the calls
+ * have returned.
+ *
+ * func is called with arg, the job index (from 0 to nb_jobs - 1), nb_jobs,
+ * the index of the executing worker and the number of workers.
+ *
+ * NOTE(review): the job state lives in the pool itself, so this looks like it
+ * must not be called on the same pool from several threads at once -- confirm
+ * with the callers.
+ */
+void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs,
+                            void (*func)(void *arg,
+                                         unsigned int job_idx,    unsigned int nb_jobs,
+                                         unsigned int thread_idx, unsigned int nb_threads),
+                            void *arg)
+{
+    pthread_mutex_lock(&ctx->mutex);
+
+    /* publish the new batch of jobs */
+    ctx->func     = func;
+    ctx->func_arg = arg;
+
+    ctx->nb_jobs          = nb_jobs;
+    ctx->nb_jobs_finished = 0;
+    ctx->next_job         = 0;
+
+    /* wake up the workers, then sleep until they have finished every job;
+     * pthread_cond_wait() releases the mutex while waiting */
+    pthread_cond_broadcast(&ctx->cond);
+    while (ctx->nb_jobs_finished < ctx->nb_jobs)
+        pthread_cond_wait(&ctx->cond, &ctx->mutex);
+
+    ctx->func     = NULL;
+    ctx->func_arg = NULL;
+
+    pthread_mutex_unlock(&ctx->mutex);
+}
diff --git a/src/threadpool.h b/src/threadpool.h
new file mode 100644
index 0000000..0f6896d
--- /dev/null
+++ b/src/threadpool.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_THREADPOOL_H
+#define MD_THREADPOOL_H
+
+/**
+ * Opaque thread pool handle.
+ */
+typedef struct ThreadPoolContext ThreadPoolContext;
+
+/**
+ * Create a thread pool with nb_threads worker threads.
+ *
+ * @param ctx the new pool is written here on success
+ * @param nb_threads number of worker threads, must not be zero
+ *
+ * @return 0 on success, a negative error code on failure
+ */
+int md_threadpool_init(ThreadPoolContext **ctx, unsigned int nb_threads);
+
+/**
+ * Stop the worker threads and free the pool. NULL is written into *ctx.
+ * Does nothing when *ctx is NULL.
+ */
+void md_threadpool_free(ThreadPoolContext **ctx);
+
+/**
+ * Execute func nb_jobs times on the pool and return after all the calls have
+ * finished.
+ *
+ * @param ctx the thread pool
+ * @param nb_jobs number of times func is to be called
+ * @param func the job callback; it receives arg, the index of the job, the
+ *             total number of jobs, the index of the executing worker thread
+ *             and the number of worker threads
+ * @param arg opaque pointer passed to every func call
+ */
+void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs,
+                           void (*func)(void *arg,
+                                        unsigned int job_idx,    unsigned int nb_jobs,
+                                        unsigned int thread_idx, unsigned int nb_threads),
+                           void *arg);
+
+#endif /* MD_THREADPOOL_H */
diff --git a/src/x86inc.asm b/src/x86inc.asm
new file mode 100644
index 0000000..dca1f78
--- /dev/null
+++ b/src/x86inc.asm
@@ -0,0 +1,1544 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2016 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Anton Mitrofanov <BugMaster@narod.ru>
+;*          Fiona Glaser <fiona@x264.com>
+;*          Henrik Gramner <henrik@gramner.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to x264-devel@videolan.org .
+
+%ifndef private_prefix ; the includer may predefine private_prefix; x264 is only the fallback
+    %define private_prefix x264
+%endif
+
+%ifndef public_prefix ; symbols declared with cvisible share the private prefix unless overridden
+    %define public_prefix private_prefix
+%endif
+
+%if HAVE_ALIGNED_STACK ; a nonzero HAVE_ALIGNED_STACK pins the known stack alignment to 16 bytes
+    %define STACK_ALIGNMENT 16
+%endif
+%ifndef STACK_ALIGNMENT ; not set above or by the includer: 16 on x86-64, 4 on x86-32
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64 ; on x86-64 exactly one of WIN64/UNIX64 is redefined to 1; on x86-32 both stay 0
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,x64
+        %define WIN64  1
+    %else ; every other output format selects the UNIX64 convention
+        %define UNIX64 1
+    %endif
+%endif
+
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+    %define FORMAT_ELF 1
+%endif ; FORMAT_ELF is now 1 for the elf, elf32 and elf64 output formats and 0 otherwise
+
+%ifdef PREFIX ; when PREFIX is defined, mangle() prepends an underscore to the symbol name
+    %define mangle(x) _ %+ x
+%else ; otherwise mangle() leaves the name unchanged
+    %define mangle(x) x
+%endif
+
+; aout does not support align=
+; NOTE: This section is out of sync with x264, in order to
+; keep supporting OS/2.
+%macro SECTION_RODATA 0-1 16 ; %1 = section alignment in bytes (default 16)
+    %ifidn __OUTPUT_FORMAT__,aout
+        section .text ; aout does not support align=, so the data is placed in .text instead
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+%if WIN64 ; PIC is always defined for WIN64
+    %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+    %undef PIC
+%endif
+%ifdef PIC ; PIC builds make RIP-relative addressing the assembler default
+    default rel
+%endif
+
+%macro CPUNOP 1 ; %1 = argument forwarded to the assembler CPU directive
+    %if HAVE_CPUNOP ; the directive is only emitted when the build sets HAVE_CPUNOP to nonzero
+        CPU %1
+    %endif
+%endmacro
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment the stack will be manually aligned
+;      and an extra register will be allocated to hold the original stack
+;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
+;      register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers (%4+ if no stack size is given, %5+ otherwise)
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3 ; %1 = argument index N, %2 = register backing rN, %3 = (optional) stack offset of argument N
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %define %2q %2
+    %if %0 == 2 ; no stack offset given: the original location of the argument is the register itself
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp qword r %+ %1 %+ m
+    %else ; x86-32: same memory operand, but the native size is dword
+        %define r%1m [rstk + stack_offset + %3]
+        %define r%1mp dword r %+ %1 %+ m
+    %endif
+    %define r%1  %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3 ; %1 = 16-bit register name, %2 = low-byte name, %3 = high-byte name
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+    %if ARCH_X86_64 == 0 ; on x86-32 the native-size name r<reg> is an alias of e<reg>
+        %define r%1 e%1
+    %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null ; null is passed as the high-byte name for si, di and bp
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-* ; arguments = register numbers; binds t0, t1, ... to r<number> in argument order
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1 ; make the next argument visible as %1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-* ; arguments = indices N; defines the size variants tNq, tNd, tNw, tNh and tNb
+    %rep %0 ; plain %define keeps these lazy, so they follow whatever tN is bound to at the point of use
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64 ; gprsize is the amount by which PUSH and POP adjust stack_offset: 8 on x86-64, 4 on x86-32
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro PUSH 1 ; push %1 and keep the stack_offset bookkeeping in sync
+    push %1
+    %ifidn rstk, rsp ; the offset is only tracked while rstk still names rsp
+        %assign stack_offset stack_offset+gprsize
+    %endif
+%endmacro
+
+%macro POP 1 ; pop %1 and keep the stack_offset bookkeeping in sync
+    pop %1
+    %ifidn rstk, rsp ; the offset is only tracked while rstk still names rsp
+        %assign stack_offset stack_offset-gprsize
+    %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-* ; arguments = register numbers; pushes every listed rN with N below regs_used
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-* ; arguments = register numbers; pops every listed rN with N below regs_used
+    %rep %0
+        %if %1 < regs_used
+            pop r%1 ; the plain instruction, not the POP macro, so stack_offset is left untouched
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-* ; arguments = argument numbers; loads every listed argument N below num_args into rN
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp ; rNmp is the native-size original location of argument N
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2 ; sub %1, %2 with stack_offset bookkeeping
+    sub %1, %2
+    %ifidn %1, rstk ; subtracting from the stack pointer moves it further from the return address
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2 ; add %1, %2 with stack_offset bookkeeping
+    add %1, %2
+    %ifidn %1, rstk ; adding to the stack pointer moves it back towards the return address
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2 ; mov %1, %2 unless both operands expand to the same text
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%macro movsxdifnidn 2 ; movsxd %1, %2 unless both operands expand to the same text
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1 ; %1 = expression checked at assembly time
+    %if (%1) == 0 ; a zero value raises an assembler error that quotes the expression
+        %error assertion ``%1'' failed
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-* ; arguments = names; the first name becomes an alias of r0, the second of r1, and so on
+    %ifdef n_arg_names ; first remove the aliases created by the previous invocation
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0 ; one set of size-suffixed aliases (q, d, w, h, b, m, mp) per name
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0 ; remembered so that the next invocation knows how many names to undefine
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15) ; mmsize rounded up to a multiple of 16
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+    %ifnum %1 ; a non-numeric %1 means no stack size was passed: nothing is allocated
+        %if %1 != 0
+            %assign %%pad 0
+            %assign stack_size %1
+            %if stack_size < 0 ; the sign only selects how rsp is saved below; the size is the absolute value
+                %assign stack_size -stack_size
+            %endif
+            %if WIN64
+                %assign %%pad %%pad + 32 ; shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+                    %endif
+                %endif
+            %endif
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+                SUB rsp, stack_size_padded
+            %else
+                %assign %%reg_num (regs_used - 1)
+                %xdefine rstk r %+ %%reg_num
+                ; align stack, and save original stack location directly above
+                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+                ; stack in a single instruction (i.e. mov rsp, rstk or mov
+                ; rsp, [rsp+stack_size_padded])
+                %if %1 < 0 ; need to store rsp on stack
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
+                %else ; can keep rsp in rstk during whole function
+                    %xdefine rstkm rstk
+                %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk
+            %endif
+            WIN64_PUSH_XMM ; expands to nothing unless WIN64 is set
+        %endif
+    %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1 ; %1 = requested stack size; adjusts regs_used before any register is pushed
+    %ifnum %1
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+            %if %1 > 0 ; a positive size keeps the original rsp in one extra register (see ALLOC_STACK)
+                %assign regs_used (regs_used + 1)
+            %endif
+            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+                %assign regs_used 5 + UNIX64 * 3
+            %endif
+        %endif
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+ ; %1 = number of PROLOGUE arguments, %2 = PROLOGUE %4, %3 = PROLOGUE %5+
+    %ifnum %2 ; %2 is a stack size, so the names are in %3
+        DEFINE_ARGS %3
+    %elif %1 == 4 ; %2 is the only name
+        DEFINE_ARGS %2
+    %elif %1 > 4 ; %2 is the first name and %3 holds the remaining ones
+        DEFINE_ARGS %2, %3
+    %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0,  rcx ; arguments 0-3 are declared without a stack offset: they live in registers
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40 ; from argument 4 on, the third value is the stack offset used for rNm
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4 ; may raise regs_used, hence the upper-bound check only afterwards
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ; r7-r14 are the registers saved by this prologue when used
+    ALLOC_STACK %4, %3
+    %if mmsize != 8 && stack_size == 0 ; no stack was allocated, so the xmm registers are spilled separately
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ; arguments 4 and up are loaded from their stack locations
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0 ; stores xmm6 and above, as far as xmm_regs_used requires
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset +  8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8 ; xmm8 and above go into 16-byte slots starting at rsp + stack_size + 32
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1 ; %1 = number of xmm registers used by the function
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 8
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+        SUB rsp, stack_size_padded
+    %endif
+    WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1 ; %1 = register used as base address for the reloads
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
+        %assign %%i xmm_regs_used
+        %rep xmm_regs_used-8 ; reload xmm8 and above, highest register first
+            %assign %%i %%i-1
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+        %endrep
+    %endif
+    %if stack_size_padded > 0
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: recover the saved rsp
+            mov rsp, rstkm
+        %else
+            add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded ; %1 has moved, so the xmm6/xmm7 addresses below compensate
+        %endif
+    %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1 ; %1 = base register; restores the xmm registers and then resets the bookkeeping
+    WIN64_RESTORE_XMM_INTERNAL %1
+    %assign stack_offset (stack_offset-stack_size_padded)
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; WIN64 epilogue: restore xmm registers and stack, pop saved registers, return
+    WIN64_RESTORE_XMM_INTERNAL rsp ; the INTERNAL variant leaves stack_offset and xmm_regs_used unchanged
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 ; reverse of the push order used by PROLOGUE
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0,  rdi ; arguments 0-5 are declared without a stack offset: they live in registers
+DECLARE_REG 1,  rsi
+DECLARE_REG 2,  rdx
+DECLARE_REG 3,  rcx
+DECLARE_REG 4,  R8
+DECLARE_REG 5,  R9
+DECLARE_REG 6,  rax, 8 ; from argument 6 on, the third value is the stack offset used for rNm
+DECLARE_REG 7,  R10, 16
+DECLARE_REG 8,  R11, 24
+DECLARE_REG 9,  rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    SETUP_STACK_POINTER %4 ; may raise regs_used, hence the upper-bound check only afterwards
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14 ; r9-r14 (rbx, rbp, R12-R15) are the registers saved by this prologue when used
+    ALLOC_STACK %4
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 ; arguments 6 and up are loaded from their stack locations
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; UNIX64 epilogue: release the stack, pop saved registers, return
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: recover the saved rsp
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 14, 13, 12, 11, 10, 9 ; reverse of the push order used by PROLOGUE
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4 ; every argument is declared with a stack offset, so all of them are loaded from memory
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-* ; arguments = argument numbers N; defines only the memory operands rNm and rNmp
+    %rep %0
+        %define r%1m [rstk + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    %if num_args > 7 ; only r0-r6 are declared as registers on x86-32, so both counts are clamped to 7
+        %assign num_args 7
+    %endif
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    SETUP_STACK_POINTER %4 ; may raise regs_used, hence the upper-bound check only afterwards
+    ASSERT regs_used <= 7
+    PUSH_IF_USED 3, 4, 5, 6 ; r3-r6 (ebx, esi, edi, ebp) are the registers saved by this prologue when used
+    ALLOC_STACK %4
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; x86-32 epilogue: release the stack, pop saved registers, return
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: recover the saved stack pointer
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
+    POP_IF_USED 6, 5, 4, 3 ; reverse of the push order used by PROLOGUE
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0 ; empty stand-ins, so that these macros can be invoked on every target
+    %macro WIN64_SPILL_XMM 1
+    %endmacro
+    %macro WIN64_RESTORE_XMM 1
+    %endmacro
+    %macro WIN64_PUSH_XMM 0
+    %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0 ; return that is explicitly marked as needing the 2-byte form
+    %if has_epilogue ; the epilogue instructions precede the ret, so the ordinary RET path is used
+        RET
+    %else
+        rep ret
+    %endif
+    annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0 ; emits ret, prefixed with rep when it directly follows a recorded branch
+    %if notcpuflag(ssse3) ; the prefix is only considered when the function is not built for ssse3 or newer
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+    %endif
+    ret
+    annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-* ; arguments = branch mnemonics; each one is wrapped in a macro of the same name
+    %rep %0
+        %macro %1 1-2 %1
+            %2 %1
+            %if notcpuflag(ssse3)
+                %%branch_instr equ $
+                %xdefine last_branch_adr %%branch_instr
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue ; the epilogue still has to run afterwards, so a real call is emitted
+        call %1
+        RET
+    %elif %2 ; no epilogue: jump to the callee; for an adjacent callee (%2 == 0) nothing is emitted
+        jmp %1
+    %endif
+    annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16 ; alignment that cglobal_internal applies in front of every function label
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 1, %1 %+ SUFFIX, %2 ; 1 selects private_prefix and hidden visibility
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal 0, %1 %+ SUFFIX, %2 ; 0 selects public_prefix and no visibility attribute
+%endmacro
+%macro cglobal_internal 2-3+ ; %1 = 1 for private symbols, 0 for public ones; %2 = name including SUFFIX; %3+ = PROLOGUE args
+    annotate_function_size ; records the size of the previous function, if there was one
+    %if %1
+        %xdefine %%FUNCTION_PREFIX private_prefix
+        %xdefine %%VISIBILITY hidden
+    %else
+        %xdefine %%FUNCTION_PREFIX public_prefix
+        %xdefine %%VISIBILITY
+    %endif
+    %ifndef cglobaled_%2 ; the name is mangled only once, even when the function is declared again
+        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %xdefine %2.skip_prologue %2 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %2, 1
+    %endif
+    %xdefine current_function %2
+    %xdefine current_function_section __SECT__
+    %if FORMAT_ELF ; ELF symbols additionally get a type and the selected visibility
+        global %2:function %%VISIBILITY
+    %else
+        global %2
+    %endif
+    align function_align
+    %2:
+    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %ifnidn %3, ""
+        PROLOGUE %3
+    %endif ; the empty string is the placeholder cglobal and cvisible pass when no PROLOGUE args were given
+%endmacro
+
+%macro cextern 1 ; %1 = unprefixed name of an external symbol; the private prefix and mangling are applied
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1 ; %1 = name of an external symbol that carries no project prefix
+    %ifdef PREFIX ; without PREFIX mangle() is the identity, so no redefinition is needed
+        %xdefine %1 mangle(%1)
+    %endif
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+%macro const 1-2+ ; %1 = unprefixed name, %2+ = data definition emitted after the label
+    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+    %if FORMAT_ELF ; on ELF the symbol is exported as hidden data
+        global %1:data hidden
+    %else
+        global %1
+    %endif
+    %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF ; the note section is only emitted for ELF output formats
+    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0 ; emits a size directive for current_function; does nothing unless yasm + ELF
+    %ifdef __YASM_VER__
+        %ifdef current_function ; nothing to annotate before the first cglobal
+            %if FORMAT_ELF
+                current_function_section ; switch to the section that was active when the function was declared
+                %%ecf equ $
+                size current_function %%ecf - current_function
+                __SECT__ ; switch back to the section that was active at the point of invocation
+            %endif
+        %endif
+    %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx      (1<<0) ; bits 0-15: each value below also contains the bits of the flags it is OR-ed with
+%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2     (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
+%assign cpuflags_avx      (1<<11)| cpuflags_sse42
+%assign cpuflags_xop      (1<<12)| cpuflags_avx
+%assign cpuflags_fma4     (1<<13)| cpuflags_avx
+%assign cpuflags_fma3     (1<<14)| cpuflags_avx
+%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
+
+%assign cpuflags_cache32  (1<<16) ; bits 16-23: flags that do not include any of the instruction-set bits above
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<21)
+%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) ; 1 iff every bit of cpuflags_x is set in cpuflags
+%define notcpuflag(x) (cpuflag(x) ^ 1) ; logical negation of cpuflag(x)
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-* ; arguments = flag names without the cpuflags_ prefix; none resets to the baseline
+    %xdefine SUFFIX
+    %undef cpuname
+    %assign cpuflags 0
+
+    %if %0 >= 1
+        %rep %0 ; OR all requested flags together and join their names with underscores into cpuname
+            %ifdef cpuname
+                %xdefine cpuname cpuname %+ _%1
+            %else
+                %xdefine cpuname %1
+            %endif
+            %assign cpuflags cpuflags | cpuflags_%1
+            %rotate 1
+        %endrep
+        %xdefine SUFFIX _ %+ cpuname
+
+        %if cpuflag(avx)
+            %assign avx_enabled 1
+        %endif
+        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) ; without sse2 (xmm) or avx2 (ymm) the move aliases map to the packed-single forms
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned) ; the aligned function variant uses the aligned move for movu as well
+            %define movu mova
+        %elif cpuflag(sse3) && notcpuflag(ssse3)
+            %define movu lddqu
+        %endif
+    %endif
+
+    %if ARCH_X86_64 || cpuflag(sse2) ; choose the argument handed to the CPU directive via CPUNOP
+        CPUNOP amdnop
+    %else
+        CPUNOP basicnop
+    %endif
+%endmacro
+
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3 ; %xdefine of the name formed by concatenating %1 and %2, with value %3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2 ; %undef of the name formed by concatenating %1 and %2
+    %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+ ; %1 = (optional) cpuflags list; selects the 8-byte mm registers as m0-m7
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+        CAT_XDEFINE m, %%i, mm %+ %%i
+        CAT_XDEFINE nnmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %rep 8 ; %%i continues at 8 here, so this undefines m8-m15 and nnmm8-nnmm15
+        CAT_UNDEF m, %%i
+        CAT_UNDEF nnmm, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+ ; %1 = (optional) cpuflags list; selects the 16-byte xmm registers as m0..m(num_mmregs-1)
+    %assign avx_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64 ; x86-64 uses 16 vector registers instead of 8
+        %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, xmm %+ %%i
+        CAT_XDEFINE nnxmm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+ ; %1 = (optional) cpuflags list; selects the 32-byte ymm registers as m0..m(num_mmregs-1)
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64 ; x86-64 uses 16 vector registers instead of 8
+        %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, ymm %+ %%i
+        CAT_XDEFINE nnymm, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM ; default setup: xmm registers with no cpuflags, in effect until a file selects something else
+
+%macro DECLARE_MMCAST 1 ; %1 = register number N; defines xmN/ymN and the <prefix><register> lookup names they expand to
+    %define  mmmm%1   mm%1
+    %define  mmxmm%1  mm%1
+    %define  mmymm%1  mm%1
+    %define xmmmm%1   mm%1
+    %define xmmxmm%1 xmm%1
+    %define xmmymm%1 xmm%1
+    %define ymmmm%1   mm%1
+    %define ymmxmm%1 xmm%1
+    %define ymmymm%1 ymm%1
+    %define xm%1 xmm %+ m%1
+    %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16 ; instantiate the casts for register numbers 0-15
+    DECLARE_MMCAST i
+    %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+    %rep %0/2 ; first pass: snapshot the current binding of the second register of every pair
+        %xdefine %%tmp%2 m%2
+        %rotate 2
+    %endrep
+    %rep %0/2 ; second pass: assign the snapshots, so that all pairs take effect simultaneously
+        %xdefine m%1 %%tmp%2
+        CAT_XDEFINE nn, m%1, %1
+        %rotate 2
+    %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+    %ifnum %1 ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %else ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-* ; arguments = register numbers; exchanges neighbouring entries along the chain
+    %rep %0-1
+        %xdefine %%tmp m%1
+        %xdefine m%1 m%2
+        %xdefine m%2 %%tmp
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
+        %rotate 1 ; the next exchange uses the following pair of the chain
+    %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-* ; arguments = register names; translated to numbers through the nn<name> lookups
+    %xdefine %%args nn %+ %1
+    %rep %0-1
+        %xdefine %%args %%args, nn %+ %2
+        %rotate 1
+    %endrep
+    SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1 ; %1 = (optional) name to save under; defaults to current_function
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs ; record the current binding of every mN as <name>_mN
+        CAT_XDEFINE %%f, %%i, m %+ %%i
+        %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0 ; nothing happens when no permutation was saved under this name
+        %assign %%i 0
+        %rep num_mmregs
+            CAT_XDEFINE m, %%i, %1_m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1 ; %1 = callee; shadows the call instruction so that the name can be resolved first
+    call_internal %1 %+ SUFFIX, %1
+%endmacro
+%macro call_internal 2 ; %1 = callee name with SUFFIX appended, %2 = plain callee name
+    %xdefine %%i %2
+    %ifndef cglobaled_%2 ; the suffixed name is used only if the plain one is unknown and the suffixed one is known
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i ; adopt the register permutation the callee saved, if any
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2 ; shadows the add instruction; only the immediate 128 is rewritten
+    %ifnum %2 ; the comparison below is only valid for numeric operands
+        %if %2==128
+            sub %1, -128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2 ; shadows the sub instruction; only the immediate 128 is rewritten
+    %ifnum %2 ; the comparison below is only valid for numeric operands
+        %if %2==128
+            add %1, -128 ; -128 fits a sign-extended 8-bit immediate, +128 does not
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16 ; defines sizeofmmN (N < 8), sizeofxmmN and sizeofymmN as 8, 16 and 32
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+    %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-* ; %1 = opcode, %2 = dst, %3+ = operands that must not be identical to dst
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2 ; compare dst with each remaining operand in turn
+        %ifidn %%dst, %3
+            %error non-avx emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+ ; emits one instruction, either VEX-prefixed, plain, or as mov + plain op
+    %ifnum sizeof%7 ; operand size: taken from the second operand if it is a known register,
+        %assign __sizeofreg sizeof%7
+    %elifnum sizeof%6 ; otherwise from the first operand,
+        %assign __sizeofreg sizeof%6
+    %else ; otherwise from the current mmsize
+        %assign __sizeofreg mmsize
+    %endif
+    %assign __emulate_avx 0
+    %if avx_enabled && __sizeofreg >= 16 ; v-prefixed form only for 16-byte or larger registers
+        %xdefine __instr v%1
+    %else
+        %xdefine __instr %1
+        %if %0 >= 8+%4 ; enough operands were passed that the non-destructive form must be emulated
+            %assign __emulate_avx 1
+        %endif
+    %endif
+    %ifnidn %2, fnord ; fnord means no minimal instruction set was declared
+        %ifdef cpuname
+            %if notcpuflag(%2)
+                %error use of ``%1'' %2 instruction in cpuname function: current_function
+            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+                %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %endif
+        %endif
+    %endif
+
+    %if __emulate_avx
+        %xdefine __src1 %7
+        %xdefine __src2 %8
+        %if %5 && %4 == 0 ; commutative 3-operand form: the sources may be exchanged
+            %ifnidn %6, %7
+                %ifidn %6, %8 ; dst equals the second source: swapping avoids the mov
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %elifnnum sizeof%8
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %endif
+            %endif
+        %endif
+        %ifnidn %6, __src1 ; dst differs from src1: copy src1 into dst first
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+            %endif
+            %if __sizeofreg == 8 ; pick the copy instruction by register size and float/int flag
+                MOVQ %6, __src1
+            %elif %3
+                MOVAPS %6, __src1
+            %else
+                MOVDQA %6, __src1
+            %endif
+        %endif
+        %if %0 >= 9 ; now emit the destructive form, forwarding the trailing operand if present
+            %1 %6, __src2, %9
+        %else
+            %1 %6, __src2
+        %endif
+    %elif %0 >= 9 ; no emulation: forward the operands unchanged to the chosen mnemonic
+        __instr %6, %7, %8, %9
+    %elif %0 == 8
+        __instr %6, %7, %8
+    %elif %0 == 7
+        __instr %6, %7
+    %else
+        __instr %6
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0 ; defaults: no cpu check, int, no emulation, not commutative
+    %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+        %ifidn %2, fnord ; fnord marks an operand slot the caller left empty
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+        %else ; all five operand slots were supplied
+            RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+; Instructions with both VEX and non-VEX encodings.
+; Non-destructive instructions need no emulation, so they are listed without the emulation parameters.
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2
+AVX_INSTR sqrtps, sse
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; base-4 constants for shuffles: qABCD == (A<<6)|(B<<4)|(C<<2)|D, e.g. q1032 == 0x4E
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10 ; pad the name with leading zeros so that it always has four digits
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+    %assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3 ; xop instruction name, multiply-step instruction, add-step instruction
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop) ; native 4-operand form
+            v%5 %1, %2, %3, %4
+        %elifnidn %1, %4 ; two-step emulation; only valid when dst is not also the last operand
+            %6 %1, %2, %3
+            %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-* ; instruction prefix, then one or more type suffixes
+    %push fma4_instr
+    %xdefine %$prefix %1
+    %rep %0 - 1 ; define one macro per suffix
+        %macro %$prefix%2 4-6 %$prefix, %2
+            %if notcpuflag(fma3) && notcpuflag(fma4)
+                %error use of ``%5%6'' fma instruction in cpuname function: current_function
+            %elif cpuflag(fma4) ; fma4 takes all four operands directly
+                v%5%6 %1, %2, %3, %4
+            %elifidn %1, %2 ; fma3: pick the 132/213/231 form by which source dst coincides with
+                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+                %ifnum sizeof%3
+                    v%{5}213%6 %2, %3, %4
+                %else
+                    v%{5}132%6 %2, %4, %3
+                %endif
+            %elifidn %1, %3
+                v%{5}213%6 %3, %2, %4
+            %elifidn %1, %4
+                v%{5}231%6 %4, %2, %3
+            %else ; dst matches none of the sources
+                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+            %endif
+        %endmacro
+        %rotate 1 ; advance to the next suffix
+    %endrep
+    %pop
+%endmacro
+
+FMA4_INSTR fmadd,    pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub,    pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd,   pd, ps, sd, ss
+FMA4_INSTR fnmsub,   pd, ps, sd, ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__ ; only override when assembling with yasm
+    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+        %macro vpbroadcastq 2 ; dst, src
+            %if sizeof%1 == 16 ; 16-byte destination
+                movddup %1, %2
+            %else ; any other destination size
+                vbroadcastsd %1, %2
+            %endif
+        %endmacro
+    %endif
+%endif
diff --git a/src/x86util.asm b/src/x86util.asm
new file mode 100644
index 0000000..66280b2
--- /dev/null
+++ b/src/x86util.asm
@@ -0,0 +1,695 @@
+;*****************************************************************************
+;* x86util.asm
+;*****************************************************************************
+;* Copyright (C) 2008-2010 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Holger Lubitz <holger@lubitz.org>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%define private_prefix qms
+%define public_prefix  qms
+%define cpuflags_mmxext cpuflags_mmx2
+
+%include "config.asm"
+
+%include "x86inc.asm"
+
+%macro SBUTTERFLY 4 ; unpack suffix, reg a, reg b, tmp reg (register numbers)
+%if avx_enabled
+    punpckh%1 m%4, m%2, m%3
+    punpckl%1 m%2, m%3
+%else
+    mova      m%4, m%2
+    punpckl%1 m%2, m%3
+    punpckh%1 m%4, m%3
+%endif
+    SWAP %3, %4
+%endmacro
+
+%macro SBUTTERFLY2 4 ; unpack suffix, reg a, reg b, tmp reg (register numbers)
+    punpckl%1 m%4, m%2, m%3 ; tmp = low-half interleave of a and b
+    punpckh%1 m%2, m%2, m%3 ; a   = high-half interleave of a and b
+    SWAP %2, %4, %3
+%endmacro
+
+%macro SBUTTERFLYPS 3 ; reg a, reg b, tmp reg (register numbers)
+    unpcklps m%3, m%1, m%2 ; tmp = low-half interleave of a and b
+    unpckhps m%1, m%1, m%2 ; a   = high-half interleave of a and b
+    SWAP %1, %3, %2
+%endmacro
+
+%macro TRANSPOSE4x4B 5 ; four row registers, one tmp register (register numbers)
+    SBUTTERFLY bw, %1, %2, %5 ; first pass: interleave bytes of adjacent rows
+    SBUTTERFLY bw, %3, %4, %5
+    SBUTTERFLY wd, %1, %3, %5 ; second pass: interleave words
+    SBUTTERFLY wd, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE4x4W 5 ; four row registers, one tmp register (register numbers)
+    SBUTTERFLY wd, %1, %2, %5 ; first pass: interleave words of adjacent rows
+    SBUTTERFLY wd, %3, %4, %5
+    SBUTTERFLY dq, %1, %3, %5 ; second pass: interleave dwords
+    SBUTTERFLY dq, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5 ; four row registers, one tmp register (register numbers)
+    SBUTTERFLY wd,  %1, %2, %5 ; word pass
+    SBUTTERFLY wd,  %3, %4, %5
+    SBUTTERFLY dq,  %1, %3, %5 ; dword pass
+    SBUTTERFLY dq,  %2, %4, %5
+    SBUTTERFLY qdq, %1, %2, %5 ; qword pass (unlike TRANSPOSE4x4W, which ends with a SWAP)
+    SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro TRANSPOSE4x4D 5 ; four row registers, one tmp register (register numbers)
+    SBUTTERFLY dq,  %1, %2, %5 ; first pass: interleave dwords of adjacent rows
+    SBUTTERFLY dq,  %3, %4, %5
+    SBUTTERFLY qdq, %1, %3, %5 ; second pass: interleave qwords
+    SBUTTERFLY qdq, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
+; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
+%macro TRANSPOSE4x4PS 5 ; four row registers, one tmp register (register numbers)
+    SBUTTERFLYPS %1, %2, %5 ; first pass: interleave floats of adjacent rows
+    SBUTTERFLYPS %3, %4, %5
+    movlhps m%5, m%1, m%3 ; tmp = low halves of rows 1 and 3
+    movhlps m%3, m%1      ; row 3 = high halves of rows 1 and 3
+    SWAP %5, %1
+    movlhps m%5, m%2, m%4 ; same recombination for rows 2 and 4
+    movhlps m%4, m%2
+    SWAP %5, %2, %3
+%endmacro
+
+%macro TRANSPOSE8x8W 9-11 ; 8 row register numbers; then a tmp register (x86-64) or memory spill slots (x86-32)
+%if ARCH_X86_64 ; %9 is used as a tmp register number
+    SBUTTERFLY wd,  %1, %2, %9
+    SBUTTERFLY wd,  %3, %4, %9
+    SBUTTERFLY wd,  %5, %6, %9
+    SBUTTERFLY wd,  %7, %8, %9
+    SBUTTERFLY dq,  %1, %3, %9
+    SBUTTERFLY dq,  %2, %4, %9
+    SBUTTERFLY dq,  %5, %7, %9
+    SBUTTERFLY dq,  %6, %8, %9
+    SBUTTERFLY qdq, %1, %5, %9
+    SBUTTERFLY qdq, %2, %6, %9
+    SBUTTERFLY qdq, %3, %7, %9
+    SBUTTERFLY qdq, %4, %8, %9
+    SWAP %2, %5
+    SWAP %4, %7
+%else
+; in:  m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11 ; no 11th argument: row %7 is still in a register and must be spilled first
+    movdqa %9, m%7
+%endif
+    SBUTTERFLY wd,  %1, %2, %7 ; the freed row %7 register serves as tmp
+    movdqa %10, m%2
+    movdqa m%7, %9
+    SBUTTERFLY wd,  %3, %4, %2
+    SBUTTERFLY wd,  %5, %6, %2
+    SBUTTERFLY wd,  %7, %8, %2
+    SBUTTERFLY dq,  %1, %3, %2
+    movdqa %9, m%3
+    movdqa m%2, %10
+    SBUTTERFLY dq,  %2, %4, %3
+    SBUTTERFLY dq,  %5, %7, %3
+    SBUTTERFLY dq,  %6, %8, %3
+    SBUTTERFLY qdq, %1, %5, %3
+    SBUTTERFLY qdq, %2, %6, %3
+    movdqa %10, m%2
+    movdqa m%3, %9
+    SBUTTERFLY qdq, %3, %7, %2
+    SBUTTERFLY qdq, %4, %8, %2
+    SWAP %2, %5
+    SWAP %4, %7
+%if %0<11 ; no 11th argument: reload the row that was left in %10
+    movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
+; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
+%macro PABSW 2 ; dst, src: dst = per-word absolute value of src
+%if cpuflag(ssse3)
+    pabsw      %1, %2
+%elif cpuflag(mmxext) ; dst = max(0 - src, src)
+    pxor    %1, %1
+    psubw   %1, %2
+    pmaxsw  %1, %2
+%else ; plain mmx: NOTE this path overwrites src and then exchanges the two names
+    pxor       %1, %1
+    pcmpgtw    %1, %2 ; dst = all-ones where src < 0
+    pxor       %2, %1 ; conditional negate: (src ^ mask) - mask
+    psubw      %2, %1
+    SWAP       %1, %2
+%endif
+%endmacro
+
+%macro PSIGNW_MMX 2 ; dst, mask: computes (dst ^ mask) - mask; presumably mask is 0 or all-ones per word
+    pxor       %1, %2
+    psubw      %1, %2
+%endmacro
+
+%macro PSIGNW_SSSE3 2 ; dst, src: thin wrapper with the same argument list as PSIGNW_MMX
+    psignw     %1, %2
+%endmacro
+
+%macro ABS1 2 ; a (in/out), tmp: in-place per-word absolute value
+%if cpuflag(ssse3) ; tmp is not touched on this path
+    pabsw   %1, %1
+%elif cpuflag(mmxext) ; a, tmp
+    pxor    %2, %2
+    psubw   %2, %1 ; tmp = -a
+    pmaxsw  %1, %2 ; a = max(a, -a)
+%else ; a, tmp
+    pxor       %2, %2
+    pcmpgtw    %2, %1 ; tmp = all-ones where a < 0
+    pxor       %1, %2 ; conditional negate: (a ^ tmp) - tmp
+    psubw      %1, %2
+%endif
+%endmacro
+
+%macro ABS2 4 ; a, b (in/out), tmp0, tmp1: in-place per-word absolute value of two registers
+%if cpuflag(ssse3) ; the tmps are not touched on this path
+    pabsw   %1, %1
+    pabsw   %2, %2
+%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
+    pxor    %3, %3
+    pxor    %4, %4
+    psubw   %3, %1 ; tmp0 = -a
+    psubw   %4, %2 ; tmp1 = -b
+    pmaxsw  %1, %3 ; a = max(a, -a)
+    pmaxsw  %2, %4 ; b = max(b, -b)
+%else ; a, b, tmp0, tmp1
+    pxor       %3, %3
+    pxor       %4, %4
+    pcmpgtw    %3, %1 ; tmp0 = all-ones where a < 0
+    pcmpgtw    %4, %2 ; tmp1 = all-ones where b < 0
+    pxor       %1, %3 ; conditional negate: (x ^ mask) - mask
+    pxor       %2, %4
+    psubw      %1, %3
+    psubw      %2, %4
+%endif
+%endmacro
+
+%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
+%if cpuflag(ssse3)
+    pabsb   %1, %1
+%else
+    pxor    %2, %2
+    psubb   %2, %1 ; tmp = -x
+    pminub  %1, %2 ; unsigned minimum of x and -x
+%endif
+%endmacro
+
+%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
+%if cpuflag(ssse3)
+    pabsb   %1, %1
+    pabsb   %2, %2
+%else
+    pxor    %3, %3
+    pxor    %4, %4
+    psubb   %3, %1 ; tmp1 = -src1
+    psubb   %4, %2 ; tmp2 = -src2
+    pminub  %1, %3 ; unsigned minimum of x and -x
+    pminub  %2, %4
+%endif
+%endmacro
+
+%macro ABSD2_MMX 4 ; a, b (in/out), tmp0, tmp1: in-place per-dword absolute value of two registers
+    pxor    %3, %3
+    pxor    %4, %4
+    pcmpgtd %3, %1 ; tmp0 = all-ones where a < 0
+    pcmpgtd %4, %2 ; tmp1 = all-ones where b < 0
+    pxor    %1, %3 ; conditional negate: (x ^ mask) - mask
+    pxor    %2, %4
+    psubd   %1, %3
+    psubd   %2, %4
+%endmacro
+
+%macro ABS4 6 ; four in/out registers, two tmps shared by both ABS2 invocations
+    ABS2 %1, %2, %5, %6
+    ABS2 %3, %4, %5, %6
+%endmacro
+
+%macro SPLATB_LOAD 3 ; dst, address of the byte, shuffle mask (ssse3 path only)
+    ; the load is placed 3 bytes early to avoid crossing a cacheline
+    movd      %1, [%2-3]
+%if cpuflag(ssse3)
+    pshufb    %1, %3
+%else
+    punpcklbw %1, %1
+    SPLATW    %1, %1, 3
+%endif
+%endmacro
+
+%macro SPLATB_REG 3 ; dst, source gpr name (a "d" suffix is appended), shuffle mask (ssse3 path only)
+    ; both paths begin by moving the gpr into dst
+    movd      %1, %2d
+%if cpuflag(ssse3)
+    pshufb    %1, %3
+%else
+    punpcklbw %1, %1
+    SPLATW    %1, %1, 0
+%endif
+%endmacro
+
+%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
+%if cpuflag(ssse3) ; native instruction; tmp (the last argument) is not used
+%if %0==5
+    palignr %1, %2, %3, %4
+%else
+    palignr %1, %2, %3
+%endif
+%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
+    %define %%dst %1
+%if %0==5 ; separate dst: copy src1 into it, then drop dst from the argument list
+%ifnidn %1, %2
+    mova    %%dst, %2
+%endif
+    %rotate 1
+%endif
+%ifnidn %4, %2 ; from here on: 1 = src1, 2 = src2, 3 = imm, 4 = tmp
+    mova    %4, %2
+%endif
+%if mmsize==8 ; 64-bit registers: bit shifts, counts scaled to bits
+    psllq   %%dst, (8-%3)*8
+    psrlq   %4, %3*8
+%else ; otherwise: byte shifts
+    pslldq  %%dst, 16-%3
+    psrldq  %4, %3
+%endif
+    por     %%dst, %4 ; combine the two shifted halves
+%endif
+%endmacro
+
+%macro PAVGB 2 ; dst, src
+%if cpuflag(mmxext)
+    pavgb   %1, %2
+%elif cpuflag(3dnow)
+    pavgusb %1, %2
+%endif ; NOTE: nothing is emitted when neither flag is set
+%endmacro
+
+%macro PSHUFLW 1+ ; operand list is forwarded unchanged
+    %if mmsize != 8
+        pshuflw %1
+    %else
+        pshufw %1
+    %endif
+%endmacro
+
+%macro PSWAPD 2 ; dst, src: exchange the two dwords of a 64-bit value
+%if cpuflag(mmxext)
+    pshufw    %1, %2, q1032
+%elif cpuflag(3dnowext)
+    pswapd    %1, %2
+%elif cpuflag(3dnow) ; three-instruction fallback
+    movq      %1, %2
+    psrlq     %1, 32 ; move the high dword down
+    punpckldq %1, %2 ; and put the low dword of src above it
+%endif ; NOTE: nothing is emitted when none of the flags is set
+%endmacro
+
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5 ; %5 is a register number that already holds the mask
+    pand   m%3, m%5, m%4 ; src .. y6 .. y4
+    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
+%else ; %5 is an operand to load the mask from
+    mova   m%1, %5
+    pand   m%3, m%1, m%4 ; src .. y6 .. y4
+    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
+%endif
+    psrlw  m%2, 8        ; dst .. y7 .. y5
+    psrlw  m%4, 8        ; src .. y7 .. y5
+%endmacro
+
+%macro SUMSUB_BA 3-4 ; size suffix, a, b[, tmp] (register numbers)
+; In terms of the input values: a becomes a+b and b becomes b-a.
+; Without a tmp register the difference is formed as 2*b - (a+b).
+%if %0==3
+    padd%1  m%2, m%3
+    padd%1  m%3, m%3
+    psub%1  m%3, m%2
+%elif avx_enabled == 0
+    mova    m%4, m%2
+    padd%1  m%2, m%3
+    psub%1  m%3, m%4
+%else
+    padd%1  m%4, m%2, m%3
+    psub%1  m%3, m%2
+    SWAP    %2, %4
+%endif
+%endmacro
+
+%macro SUMSUB_BADC 5-6 ; size suffix, a, b, c, d[, tmp]: sum/difference of the pairs (a,b) and (c,d)
+%if %0==6 ; tmp available: delegate each pair to SUMSUB_BA
+    SUMSUB_BA %1, %2, %3, %6
+    SUMSUB_BA %1, %4, %5, %6
+%else ; no tmp: a += b, b = 2*b - a (and likewise for c, d), interleaved
+    padd%1  m%2, m%3
+    padd%1  m%4, m%5
+    padd%1  m%3, m%3
+    padd%1  m%5, m%5
+    psub%1  m%3, m%2
+    psub%1  m%5, m%4
+%endif
+%endmacro
+
+%macro SUMSUB2_AB 4 ; size suffix, a, b (register number or other operand), tmp: a = 2a+b, tmp = a-2b
+%ifnum %3 ; b is a register number
+    psub%1  m%4, m%2, m%3 ; tmp = a - b
+    psub%1  m%4, m%3      ; tmp = a - 2b
+    padd%1  m%2, m%2      ; a = 2a
+    padd%1  m%2, m%3      ; a = 2a + b
+%else ; b is used verbatim as an operand
+    mova    m%4, m%2
+    padd%1  m%2, m%2
+    padd%1  m%2, %3
+    psub%1  m%4, %3
+    psub%1  m%4, %3
+%endif
+%endmacro
+
+%macro SUMSUB2_BA 4 ; size suffix, a, b, tmp (register numbers): a = a+2b, b = b-2a
+%if avx_enabled == 0
+    mova    m%4, m%2 ; keep the original a
+    padd%1  m%2, m%3
+    padd%1  m%2, m%3
+    psub%1  m%3, m%4
+    psub%1  m%3, m%4
+%else
+    padd%1  m%4, m%2, m%3 ; the sum is built in tmp so that a stays intact for the subtractions
+    padd%1  m%4, m%3
+    psub%1  m%3, m%2
+    psub%1  m%3, m%2
+    SWAP     %2,  %4
+%endif
+%endmacro
+
+%macro SUMSUBD2_AB 5 ; size suffix, a (%2), b (%3), two tmps (register numbers or memory operands)
+%ifnum %4 ; tmps are register numbers
+    psra%1  m%5, m%2, 1  ; tmp %5 = %2>>1
+    psra%1  m%4, m%3, 1  ; tmp %4 = %3>>1
+    padd%1  m%4, m%2     ; %3: %3>>1+%2
+    psub%1  m%5, m%3     ; %2: %2>>1-%3
+    SWAP     %2, %5
+    SWAP     %3, %4
+%else ; tmps are used verbatim as operands holding copies of the inputs
+    mova    %5, m%2
+    mova    %4, m%3
+    psra%1  m%3, 1  ; %3: %3>>1
+    psra%1  m%2, 1  ; %2: %2>>1
+    padd%1  m%3, %5 ; %3: %3>>1+%2
+    psub%1  m%2, %4 ; %2: %2>>1-%3
+%endif
+%endmacro
+
+%macro DCT4_1D 5 ; four row register numbers, then a tmp register number or a memory address
+%ifnum %5 ; tmp is a register number
+    SUMSUB_BADC w, %4, %1, %3, %2, %5
+    SUMSUB_BA   w, %3, %4, %5
+    SUMSUB2_AB  w, %1, %2, %5
+    SWAP %1, %3, %4, %5, %2
+%else ; tmp is an address: the tmp-less helper forms are used and one row is spilled
+    SUMSUB_BADC w, %4, %1, %3, %2
+    SUMSUB_BA   w, %3, %4
+    mova     [%5], m%2 ; spill row %2 so that its register can serve as tmp
+    SUMSUB2_AB  w, %1, [%5], %2
+    SWAP %1, %3, %4, %2
+%endif
+%endmacro
+
+%macro IDCT4_1D 6-7 ; size suffix, four row register numbers, then two tmp registers or one memory address
+%ifnum %6 ; tmps are register numbers
+    SUMSUBD2_AB %1, %3, %5, %7, %6
+    ; %3: %3>>1-%5 %5: %3+%5>>1
+    SUMSUB_BA   %1, %4, %2, %7
+    ; %4: %2+%4 %2: %2-%4
+    SUMSUB_BADC %1, %5, %4, %3, %2, %7
+    ; %5: %2+%4 + (%3+%5>>1)
+    ; %4: %2+%4 - (%3+%5>>1)
+    ; %3: %2-%4 + (%3>>1-%5)
+    ; %2: %2-%4 - (%3>>1-%5)
+%else ; %6 is an address; the second scratch slot follows it at +16 (w) or +32 (otherwise)
+%ifidn %1, w
+    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
+%else
+    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
+%endif
+    SUMSUB_BA   %1, %4, %2
+    SUMSUB_BADC %1, %5, %4, %3, %2
+%endif
+    SWAP %2, %5, %4
+    ; %2: %2+%4 + (%3+%5>>1) row0
+    ; %3: %2-%4 + (%3>>1-%5) row1
+    ; %4: %2-%4 - (%3>>1-%5) row2
+    ; %5: %2+%4 - (%3+%5>>1) row3
+%endmacro
+
+
+%macro LOAD_DIFF 5 ; dst, tmp, unpack register or "none", operand a, operand b
+    ; dst ends up holding the per-word difference of the unpacked bytes of a and b
+    movh       %1, %4
+%ifidn %3, none
+    ; no unpack register: b supplies the high byte of every word on both sides
+    movh       %2, %5
+    punpcklbw  %1, %2
+    punpcklbw  %2, %2
+%else
+    punpcklbw  %1, %3
+    movh       %2, %5
+    punpcklbw  %2, %3
+%endif
+    psubw      %1, %2
+%endmacro
+
+%macro STORE_DCT 6 ; four register numbers, base address, offset
+    movq   [%5+%6+ 0], m%1 ; low 8 bytes of each register go to offsets 0..24
+    movq   [%5+%6+ 8], m%2
+    movq   [%5+%6+16], m%3
+    movq   [%5+%6+24], m%4
+    movhps [%5+%6+32], m%1 ; high 8 bytes of each register go to offsets 32..56
+    movhps [%5+%6+40], m%2
+    movhps [%5+%6+48], m%3
+    movhps [%5+%6+56], m%4
+%endmacro
+
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, unpack reg, 2x pointer (default r0, r2), increment? (default 0)
+    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3] ; r1 / r3 are used as the two row strides
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5] ; presumably r4 = 3*r1 and r5 = 3*r3, set up by the caller
+%if %10 ; optionally advance both pointers by four rows
+    lea %8, [%8+4*r1]
+    lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro DIFFx2 6-7 ; data1, data2, tmp, unpack reg, mem1, mem2; a 7th argument is accepted but unused
+    movh       %3, %5
+    punpcklbw  %3, %4
+    psraw      %1, 6 ; data1 >>= 6 (arithmetic)
+    paddsw     %1, %3 ; add the unpacked bytes from mem1 with signed saturation
+    movh       %3, %6
+    punpcklbw  %3, %4
+    psraw      %2, 6
+    paddsw     %2, %3
+    packuswb   %2, %1 ; result: data2's bytes in the low half, data1's in the high half
+%endmacro
+
+%macro STORE_DIFF 4 ; data, tmp, unpack reg, memory operand (read, then written)
+    movh       %2, %4
+    punpcklbw  %2, %3
+    psraw      %1, 6 ; data >>= 6 (arithmetic)
+    paddsw     %1, %2 ; add the unpacked bytes from memory with signed saturation
+    packuswb   %1, %1 ; pack back to bytes with unsigned saturation
+    movh       %4, %1
+%endmacro
+
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movh       %3, [%7] ; load two rows: [source] and [source+stride]
+    movh       %4, [%7+%8]
+    psraw      %1, %6
+    psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    paddw      %3, %1 ; add the shifted values to the unpacked rows
+    paddw      %4, %2
+    packuswb   %3, %5 ; pack back to bytes with unsigned saturation
+    packuswb   %4, %5
+    movh     [%7], %3 ; store both rows back to where they were loaded from
+    movh  [%7+%8], %4
+%endmacro
+
+%macro PMINUB 3 ; dst, src, tmp (tmp is ignored on the mmxext path)
+%if cpuflag(mmxext)
+    pminub   %1, %2
+%else ; dst, src, tmp
+    mova     %3, %1
+    psubusb  %3, %2 ; tmp = dst - src with unsigned saturation (0 where dst <= src)
+    psubb    %1, %3 ; dst - tmp = min(dst, src)
+%endif
+%endmacro
+
+%macro SPLATW 2-3 0 ; dst, src, word index (default 0)
+%if mmsize == 16
+    pshuflw    %1, %2, (%3)*0x55 ; index*0x55 repeats the 2-bit index in all four selector fields
+    punpcklqdq %1, %1 ; copy the low qword into the high qword
+%elif cpuflag(mmxext)
+    pshufw     %1, %2, (%3)*0x55
+%else ; no shuffle instruction: two rounds of unpacking a register with itself
+    %ifnidn %1, %2
+        mova       %1, %2
+    %endif
+    %if %3 & 2 ; bit 1 of the index selects the half for the first round
+        punpckhwd  %1, %1
+    %else
+        punpcklwd  %1, %1
+    %endif
+    %if %3 & 1 ; bit 0 of the index selects the half for the second round
+        punpckhwd  %1, %1
+    %else
+        punpcklwd  %1, %1
+    %endif
+%endif
+%endmacro
+
+%macro SPLATD 1 ; reg (in/out): replicate dword 0 across the register
+%if mmsize == 8
+    punpckldq  %1, %1
+%elif cpuflag(sse2)
+    pshufd  %1, %1, 0
+%elif cpuflag(sse)
+    shufps  %1, %1, 0
+%endif ; NOTE: nothing is emitted when none of the conditions holds
+%endmacro
+
+%macro CLIPW 3 ;(dst, min, max): clamp signed words of dst
+    pmaxsw %1, %2 ; raise to at least min
+    pminsw %1, %3 ; lower to at most max
+%endmacro
+
+%macro PMINSD_MMX 3 ; dst, src, tmp
+    mova      %3, %2
+    pcmpgtd   %3, %1 ; tmp = all-ones where src > dst
+    pxor      %1, %2 ; dst = ((dst ^ src) & tmp) ^ src
+    pand      %1, %3 ; keeps dst where the mask is set,
+    pxor      %1, %2 ; takes src elsewhere
+%endmacro
+
+%macro PMAXSD_MMX 3 ; dst, src, tmp
+    mova      %3, %1
+    pcmpgtd   %3, %2 ; tmp = all-ones where dst > src
+    pand      %1, %3 ; keep dst where it is the larger one
+    pandn     %3, %2 ; take src elsewhere
+    por       %1, %3
+%endmacro
+
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp; NOTE(review): tmp is passed on unconditionally
+    PMINSD_MMX %1, %3, %4 ; lower to at most max
+    PMAXSD_MMX %1, %2, %4 ; raise to at least min
+%endmacro
+
+%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+    cvtdq2ps  %1, %1 ; int32 -> float
+    minps     %1, %3
+    maxps     %1, %2
+    cvtps2dq  %1, %1 ; float -> int32
+%endmacro
+
+%macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused
+    pminsd  %1, %3 ; lower to at most max
+    pmaxsd  %1, %2 ; raise to at least min
+%endmacro
+
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
+%if notcpuflag(avx) ; sse: load the scalar, then replicate it with a shuffle
+    movss        %1, %2
+    shufps       %1, %1, 0
+%else
+    vbroadcastss %1, %2
+%endif
+%endmacro
+
+%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
+%if cpuflag(avx) && mmsize == 32 ; the avx instruction is used only with 32-byte registers
+    vbroadcastsd %1, %2
+%elif cpuflag(sse3)
+    movddup      %1, %2
+%else ; sse2
+    movsd        %1, %2
+    movlhps      %1, %1 ; copy the low qword into the high qword
+%endif
+%endmacro
+
+%macro SHUFFLE_MASK_W 8 ; eight word indices; emits 16 data bytes
+    %rep 8
+        %if %1>=0x80 ; values with the top bit set are emitted verbatim, twice
+            db %1, %1
+        %else ; word index i becomes the byte indices 2i and 2i+1
+            db %1*2
+            db %1*2+1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro PMOVSXWD 2; dst, src
+%if cpuflag(sse4)
+    pmovsxwd     %1, %2
+%else
+    %ifnidn %1, %2
+    mova         %1, %2
+    %endif
+    punpcklwd    %1, %1 ; duplicate each low word into both halves of a dword
+    psrad        %1, 16 ; an arithmetic shift then leaves the sign-extended word
+%endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddps
+%macro FMULADD_PS 5 ; dst, src1, src2, src3, tmp: dst = src1*src2 + src3
+    %if cpuflag(fma3) || cpuflag(fma4)
+        fmaddps %1, %2, %3, %4
+    %elifidn %1, %4 ; dst is also the addend: the product must go through tmp
+        mulps   %5, %2, %3
+        addps   %1, %4, %5
+    %else ; the product can be formed directly in dst
+        mulps   %1, %2, %3
+        addps   %1, %4
+    %endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddpd
+%macro FMULADD_PD 5 ; dst, src1, src2, src3, tmp: dst = src1*src2 + src3
+    %if cpuflag(fma3) || cpuflag(fma4)
+        fmaddpd %1, %2, %3, %4
+    %elifidn %1, %4 ; dst is also the addend: the product must go through tmp
+        mulpd   %5, %2, %3
+        addpd   %1, %4, %5
+    %else ; the product can be formed directly in dst
+        mulpd   %1, %2, %3
+        addpd   %1, %4
+    %endif
+%endmacro
author	Anton Khirnov <anton@khirnov.net>	2018-04-07 18:13:49 +0200
committer	Anton Khirnov <anton@khirnov.net>	2018-04-07 18:13:49 +0200
commit	d53f73b7f7c728e96ffc07b55fa30b9e6fc5121c (patch)
tree	c79d588e646ea3e65b2db1532d65bb691ee88b0e