summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2018-04-07 18:13:49 +0200
committerAnton Khirnov <anton@khirnov.net>2018-04-07 18:13:49 +0200
commitd53f73b7f7c728e96ffc07b55fa30b9e6fc5121c (patch)
treec79d588e646ea3e65b2db1532d65bb691ee88b0e
Initial commit.
-rw-r--r--configuration.ccl2
-rw-r--r--interface.ccl16
-rw-r--r--param.ccl37
-rw-r--r--schedule.ccl30
-rw-r--r--src/MinimalDistortion.m1490
-rw-r--r--src/basis.c281
-rw-r--r--src/basis.h45
-rw-r--r--src/bicgstab.c410
-rw-r--r--src/bicgstab.h60
-rw-r--r--src/common.h29
-rw-r--r--src/config.asm1325
-rw-r--r--src/expansion.asm91
-rw-r--r--src/gamma_freeze_template.c507
-rw-r--r--src/make.code.defn7
-rw-r--r--src/md.c573
-rw-r--r--src/md.h19
-rw-r--r--src/md_solve.c818
-rw-r--r--src/md_solve.h58
-rw-r--r--src/md_solve_template.c577
-rw-r--r--src/pssolve.c498
-rw-r--r--src/pssolve.h139
-rw-r--r--src/register.c7
-rw-r--r--src/threadpool.c174
-rw-r--r--src/threadpool.h32
-rw-r--r--src/x86inc.asm1544
-rw-r--r--src/x86util.asm695
26 files changed, 9464 insertions, 0 deletions
diff --git a/configuration.ccl b/configuration.ccl
new file mode 100644
index 0000000..3565166
--- /dev/null
+++ b/configuration.ccl
@@ -0,0 +1,2 @@
+# Configuration definition for thorn MinimalDistortionAxi
+
diff --git a/interface.ccl b/interface.ccl
new file mode 100644
index 0000000..703f924
--- /dev/null
+++ b/interface.ccl
@@ -0,0 +1,16 @@
+# Interface definition for thorn MinimalDistortionAxi
+implements: MinimalDistortionAxi
+
+INHERITS: ADMBase grid CoordBase MethodOfLines
+
+CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx)
+
+REQUIRES FUNCTION MoLRegisterConstrained
+REQUIRES FUNCTION MoLRegisterSaveAndRestore
+REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup
+
+public:
+CCTK_REAL betax_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant
+CCTK_REAL betaz_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant
diff --git a/param.ccl b/param.ccl
new file mode 100644
index 0000000..3c3d285
--- /dev/null
+++ b/param.ccl
@@ -0,0 +1,37 @@
+# Parameter definitions for thorn MinimalDistortionAxi
+#
+SHARES: ADMBase
+EXTENDS KEYWORD shift_evolution_method
+{
+ "minimal_distortion_axi" :: "minimal distortion axi"
+}
+
+RESTRICTED:
+CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover
+{
+ 1: :: ""
+} 40
+
+CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover
+{
+ 1: :: ""
+} 40
+
+CCTK_REAL filter_power "" STEERABLE=recover
+{
+ 0: :: ""
+} 64.0
+
+CCTK_REAL scale_factor "" STEERABLE=recover
+{
+ 0: :: ""
+} 64.0
+
+CCTK_REAL scale_power "" STEERABLE=recover
+{
+ 0: :: ""
+} 64.0
+
+BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in beta*_coeffs" STEERABLE=recover
+{
+} "no"
diff --git a/schedule.ccl b/schedule.ccl
new file mode 100644
index 0000000..5976500
--- /dev/null
+++ b/schedule.ccl
@@ -0,0 +1,30 @@
+# Schedule definitions for thorn MinimalDistortionAxi
+#
+if (CCTK_Equals(shift_evolution_method, "minimal_distortion_axi")) {
+ SCHEDULE minimal_distortion_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_RHS {
+ LANG: C
+ } "Minimal distortion shift eval"
+
+ SCHEDULE minimal_distortion_solve IN ML_BSSN_evolCalcGroup BEFORE minimal_distortion_eval {
+ #SCHEDULE minimal_distortion_solve IN MoL_PreStep {
+ LANG: C
+ } "Minimal distortion solve W"
+
+ #SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution {
+ # LANG: C
+ #} "Quasimaximal slicing"
+
+ SCHEDULE minimal_distortion_init IN ADMBase_InitialData {
+ LANG: C
+ } ""
+
+ SCHEDULE minimal_distortion_axi_register_mol IN MoL_Register {
+ LANG: C
+ } ""
+
+
+ if (export_coeffs) {
+ STORAGE: betax_coeffs
+ STORAGE: betaz_coeffs
+ }
+}
diff --git a/src/MinimalDistortion.m b/src/MinimalDistortion.m
new file mode 100644
index 0000000..c0d34a6
--- /dev/null
+++ b/src/MinimalDistortion.m
@@ -0,0 +1,1490 @@
+
+SetEnhancedTimes[False];
+SetSourceLanguage["C"];
+
+(******************************************************************************)
+(* Options *)
+(******************************************************************************)
+
+createCode[derivOrder_, useJacobian_, splitUpwindDerivs_, evolutionTimelevels_, addMatter_, formulation_] :=
+Module[{prefix, suffix, thorn},
+
+prefix = "ML_";
+suffix =
+ ""
+ <> If [useJacobian, "_MP", ""]
+ <> If [derivOrder!=4, "_O" <> ToString[derivOrder], ""]
+ <> If [splitUpwindDerivs, "", "_UPW"]
+ (* <> If [evolutionTimelevels!=3, "_TL" <> ToString[evolutionTimelevels], ""] *)
+ (* <> If [addMatter==1, "_M", ""] *)
+ ;
+
+thorn = prefix <> formulation <> suffix;
+
+SetAttributes[IfCCZ4, HoldAll];
+IfCCZ4[expr_, else_:Sequence[]] := If[formulation === "CCZ4", expr, Unevaluated[else]];
+
+(******************************************************************************)
+(* Derivatives *)
+(******************************************************************************)
+
+KD = KroneckerDelta;
+
+derivatives =
+{
+ PDstandardNth[i_] -> StandardCenteredDifferenceOperator[1,fdOrder/2,i],
+ PDstandardNth[i_,i_] -> StandardCenteredDifferenceOperator[2,fdOrder/2,i],
+ PDstandardNth[i_,j_] -> StandardCenteredDifferenceOperator[1,fdOrder/2,i] *
+ StandardCenteredDifferenceOperator[1,fdOrder/2,j],
+ PDdissipationNth[i_] ->
+ (-1)^(fdOrder/2) *
+ spacing[i]^(fdOrder+1) / 2^(fdOrder+2) *
+ StandardCenteredDifferenceOperator[fdOrder+2,fdOrder/2+1,i],
+
+(* PD: These come from my mathematica notebook
+ "Upwind-Kranc-Convert.nb" that converts upwinding finite
+ differencing operators generated by
+ StandardUpwindDifferenceOperator into this form *)
+
+ Sequence@@Flatten[Table[
+ {PDupwindNth[i] -> Switch[fdOrder,
+ 2, (dir[i]*(-3 + 4*shift[i]^dir[i] - shift[i]^(2*dir[i])))/(2*spacing[i]),
+ 4, (dir[i]*(-10 - 3/shift[i]^dir[i] + 18*shift[i]^dir[i] -
+ 6*shift[i]^(2*dir[i]) + shift[i]^(3*dir[i])))/(12*spacing[i]),
+ 6, (dir[i]*(-35 + 2/shift[i]^(2*dir[i]) - 24/shift[i]^dir[i] + 80*shift[i]^dir[i] -
+ 30*shift[i]^(2*dir[i]) + 8*shift[i]^(3*dir[i]) - shift[i]^(4*dir[i])))/(60*spacing[i]),
+ 8, (dir[i]*(-378 - 5/shift[i]^(3*dir[i]) + 60/shift[i]^(2*dir[i]) - 420/shift[i]^dir[i] +
+ 1050*shift[i]^dir[i] - 420*shift[i]^(2*dir[i]) + 140*shift[i]^(3*dir[i]) - 30*shift[i]^(4*dir[i]) +
+ 3*shift[i]^(5*dir[i])))/(840*spacing[i])],
+
+ PDupwindNthAnti[i] -> Switch[fdOrder,
+ 2, (+1 shift[i]^(-2) -4 shift[i]^(-1) +0 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]),
+ 4, (-1 shift[i]^(-3) +6 shift[i]^(-2) -21 shift[i]^(-1 )+0 shift[i]^( 0) +21 shift[i]^(+1)
+ -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]),
+ 6, (+1 shift[i]^(-4) -8 shift[i]^(-3) +32 shift[i]^(-2) -104 shift[i]^(-1) +0 shift[i]^( 0)
+ +104 shift[i]^(+1) -32 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]),
+ 8, (-3 shift[i]^(-5) +30 shift[i]^(-4) -145 shift[i]^(-3) +480 shift[i]^(-2) -1470 shift[i]^(-1)
+ +0 shift[i]^( 0) +1470 shift[i]^(+1) -480 shift[i]^(+2) +145 shift[i]^(+3) -30 shift[i]^(+4)
+ +3 shift[i]^(+5)) / (1680 spacing[i])],
+
+ PDupwindNthSymm[i] -> Switch[fdOrder,
+ 2, (-1 shift[i]^(-2) +4 shift[i]^(-1) -6 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]),
+ 4, (+1 shift[i]^(-3) -6 shift[i]^(-2) +15 shift[i]^(-1) -20 shift[i]^( 0) +15 shift[i]^(+1)
+ -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]),
+ 6, (-1 shift[i]^(-4) +8 shift[i]^(-3) - 28 shift[i]^(-2)+56 shift[i]^(-1)-70 shift[i]^( 0)
+ +56 shift[i]^(+1) -28 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]),
+ 8, (+3 shift[i]^(-5) -30 shift[i]^(-4) +135 shift[i]^(-3) -360 shift[i]^(-2) +630 shift[i]^(-1)
+ -756 shift[i]^( 0) +630 shift[i]^(+1) -360 shift[i]^(+2) +135 shift[i]^(+3) -30 shift[i]^(+4)
+ +3 shift[i]^(+5)) / (1680 spacing[i])],
+
+ (* TODO: make these higher order stencils *)
+ PDonesided[i] -> dir[i] (-1 + shift[i]^dir[i]) / spacing[i]} /. i->j, {j,1,3}],1]
+};
+
+PD = PDstandardNth; (* short alias for the centered-difference operator defined in derivatives *)
+PDu = PDupwindNth; (* short alias for the single-stencil upwind operator *)
+PDua = PDupwindNthAnti; (* short alias for the antisymmetric part of the upwind stencil *)
+PDus = PDupwindNthSymm; (* short alias for the symmetric part of the upwind stencil *)
+(* PDo = PDonesided; -- alias for the one-sided operator, currently disabled *)
+PDdiss = PDdissipationNth; (* short alias for the dissipation operator *)
+
+If [splitUpwindDerivs, (* pick one of the two Upwind definitions depending on the createCode argument *)
+ Upwind[dir_, var_, idx_] := dir PDua[var,idx] + Abs[dir] PDus[var,idx], (* split form: antisymmetric stencil times dir plus symmetric stencil times Abs[dir] *)
+ Upwind[dir_, var_, idx_] := dir PDu[var,idx]]; (* unsplit form: one upwind stencil times dir *)
+
+
+
+(******************************************************************************)
+(* Tensors *)
+(******************************************************************************)
+
+(* Register the tensor quantities with the TensorTools package; every symbol is listed exactly once *)
+Map [DefineTensor,
+ {normal, tangentA, tangentB, dir,
+ nn, nu, nlen, nlen2, su, vg,
+ xx, rr, th, ph,
+ admg, admK, admalpha, admdtalpha, qmsw, admbeta, admdtbeta, H, M, term1, term2, term3,
+ g, detg, gu, G, R, trR, Km, trK, cdphi, cdphi2,
+ phi, gt, At, Xt, Xtn, Theta, Z,
+ (*
+ alpha, A,
+ *)
+ alpha,
+ beta, B, Atm, Atu, trA, Ats, trAts,
+ Kdot, Xtdot, phidot, K, (* Km is already registered above, next to trK *)
+ dottrK, dotXt,
+ cXt, cS, cA,
+ e4phi, em4phi, ddetg, detgt, gtu, ddetgt, dgtu, ddgtu, Gtl, Gtlu, Gt, Ddetgt,
+ Rt, Rphi, gK,
+ T00, T0, T, rho, S,
+ x, y, z, r,
+ epsdiss}];
+
+(* NOTE: It seems as if Lie[.,.] did not take these tensor weights
+ into account. Presumably, CD[.,.] and CDt[.,.] don't do this either. *)
+SetTensorAttribute[phi, TensorWeight, +1/6];
+SetTensorAttribute[gt, TensorWeight, -2/3];
+SetTensorAttribute[Xt, TensorWeight, +2/3];
+SetTensorAttribute[At, TensorWeight, -2/3];
+SetTensorAttribute[cXt, TensorWeight, +2/3];
+SetTensorAttribute[cS, TensorWeight, +2 ];
+
+Map [AssertSymmetricIncreasing,
+ {admg[la,lb], admK[la,lb], g[la,lb], K[la,lb], R[la,lb], cdphi2[la,lb],
+ gt[la,lb], At[la,lb], Ats[la,lb], Rt[la,lb], Rphi[la,lb], T[la,lb], Kdot[la, lb]}];
+AssertSymmetricIncreasing [G[ua,lb,lc], lb, lc];
+AssertSymmetricIncreasing [Gtl[la,lb,lc], lb, lc];
+AssertSymmetricIncreasing [Gt[ua,lb,lc], lb, lc];
+AssertSymmetricIncreasing [gK[la,lb,lc], la, lb];
+Map [AssertSymmetricIncreasing,
+ {gu[ua,ub], gtu[ua,ub], Atu[ua,ub]}];
+AssertSymmetricIncreasing [dgtu[ua,ub,lc], ua, ub];
+AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], ua, ub];
+AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], lc, ld];
+
+DefineConnection [CD, PD, G];
+DefineConnection [CDt, PD, Gt];
+
+(* Use the CartGrid3D variable names *)
+x1=x; x2=y; x3=z;
+
+(* Use the ADMBase variable names *)
+admg11=gxx; admg12=gxy; admg22=gyy; admg13=gxz; admg23=gyz; admg33=gzz;
+admK11=kxx; admK12=kxy; admK22=kyy; admK13=kxz; admK23=kyz; admK33=kzz;
+admalpha=alp;
+admdtalpha=dtalp;
+admbeta1=betax; admbeta2=betay; admbeta3=betaz;
+admdtbeta1=dtbetax; admdtbeta2=dtbetay; admdtbeta3=dtbetaz;
+qmsw=W;
+(*alpha=admalpha;*)
+
+(* Use the TmunuBase variable names *)
+T00=eTtt;
+T01=eTtx; T02=eTty; T03=eTtz;
+T11=eTxx; T12=eTxy; T22=eTyy; T13=eTxz; T23=eTyz; T33=eTzz;
+
+
+
+(******************************************************************************)
+(* Expressions *)
+(******************************************************************************)
+
+(* enum constants for conformalMethod; these must be consistent
+ with the definition of the Cactus parameter conformalMethod *)
+CMphi = 0;
+CMW = 1;
+
+detgExpr = Det [MatrixOfComponents [g [la,lb]]];
+ddetgExpr[la_] =
+ Sum [D[Det[MatrixOfComponents[g[la, lb]]], X] PD[X, la],
+ {X, Union[Flatten[MatrixOfComponents[g[la, lb]]]]}];
+
+detgtExpr = Det [MatrixOfComponents [gt[la,lb]]];
+ddetgtExpr[la_] =
+ Sum [D[Det[MatrixOfComponents[gt[la, lb]]], X] PD[X, la],
+ {X, Union[Flatten[MatrixOfComponents[gt[la, lb]]]]}];
+
+etaExpr = SpatialBetaDriverRadius / Max [r, SpatialBetaDriverRadius];
+thetaExpr = Min [Exp [1 - r / SpatialShiftGammaCoeffRadius], 1];
+
+
+
+(******************************************************************************)
+(* Groups *)
+(******************************************************************************)
+
+evolvedGroups =
+ {SetGroupName [CreateGroupFromTensor [phi ], prefix <> "log_confac"],
+ SetGroupName [CreateGroupFromTensor [gt[la,lb]], prefix <> "metric" ],
+ SetGroupName [CreateGroupFromTensor [Xt[ua] ], prefix <> "Gamma" ],
+ SetGroupName [CreateGroupFromTensor [trK ], prefix <> "trace_curv"],
+ SetGroupName [CreateGroupFromTensor [At[la,lb]], prefix <> "curv" ],
+ SetGroupName [CreateGroupFromTensor [alpha ], prefix <> "lapse" ],
+(*SetGroupName [CreateGroupFromTensor [A ], prefix <> "dtlapse" ],*)
+ SetGroupName [CreateGroupFromTensor [Kdot[la, lb]], prefix <> "Kdot" ],
+ SetGroupName [CreateGroupFromTensor [Xtdot[ua]], prefix <> "Xtdot" ],
+ SetGroupName [CreateGroupFromTensor [phidot], prefix <> "phidot" ],
+ SetGroupName [CreateGroupFromTensor [beta[ua] ], prefix <> "shift" ],
+ SetGroupName [CreateGroupFromTensor [B[ua] ], prefix <> "dtshift" ],
+ IfCCZ4[SetGroupName[CreateGroupFromTensor[Theta], prefix <> "Theta"]]};
+evaluatedGroups =
+ {SetGroupName [CreateGroupFromTensor [H ], prefix <> "Ham"],
+ SetGroupName [CreateGroupFromTensor [M[la] ], prefix <> "mom"],
+ SetGroupName [CreateGroupFromTensor [term1 ], prefix <> "term1"],
+ SetGroupName [CreateGroupFromTensor [term2 ], prefix <> "term2"],
+ SetGroupName [CreateGroupFromTensor [term3 ], prefix <> "term3"],
+ SetGroupName [CreateGroupFromTensor [cS ], prefix <> "cons_detg"],
+ SetGroupName [CreateGroupFromTensor [cXt[ua]], prefix <> "cons_Gamma"],
+ SetGroupName [CreateGroupFromTensor [cA ], prefix <> "cons_traceA"]};
+
+declaredGroups = Join [evolvedGroups, evaluatedGroups];
+declaredGroupNames = Map [First, declaredGroups];
+
+
+
+extraGroups =
+ {{"Grid::coordinates", {x, y, z, r}},
+ {"ADMBase::metric", {gxx, gxy, gxz, gyy, gyz, gzz}},
+ {"ADMBase::curv", {kxx, kxy, kxz, kyy, kyz, kzz}},
+ {"ADMBase::lapse", {alp}},
+ {"ADMBase::dtlapse", {dtalp}},
+ {"ADMBase::shift", {betax, betay, betaz}},
+ {"ADMBase::dtshift", {dtbetax, dtbetay, dtbetaz}},
+ {"QuasiMaximalSlicing::W", { W }},
+ {"TmunuBase::stress_energy_scalar", {eTtt}},
+ {"TmunuBase::stress_energy_vector", {eTtx, eTty, eTtz}},
+ {"TmunuBase::stress_energy_tensor", {eTxx, eTxy, eTxz, eTyy, eTyz, eTzz}}
+};
+
+groups = Join [declaredGroups, extraGroups];
+
+
+
+(******************************************************************************)
+(* Initial data *)
+(******************************************************************************)
+
+initialCalc =
+{
+ Name -> thorn <> "_Minkowski",
+ Schedule -> {"IN ADMBase_InitialData"},
+ ConditionalOnKeyword -> {"my_initial_data", "Minkowski"},
+ Equations ->
+ {
+ phi -> IfThen[conformalMethod==CMW, 1, 0],
+ gt[la,lb] -> KD[la,lb],
+ trK -> 0,
+ At[la,lb] -> 0,
+ Xt[ua] -> 0,
+ (*alpha -> 1,
+ A -> 0,*)
+ beta[ua] -> 0,
+ B[ua] -> 0,
+ IfCCZ4[Theta -> 0]
+ }
+};
+
+
+
+(******************************************************************************)
+(* Split a calculation *)
+(******************************************************************************)
+
+PartialCalculation[calc_, suffix_, updates_, evolVars_] := (* derive a sub-calculation from calc: renamed with suffix, entries from updates applied, Equations filtered down to evolVars and the shorthands *)
+Module[
+ {name, calc1, replaces, calc2, vars, patterns, eqs, calc3},
+ (* Add suffix to name: the new Name is the Name entry of calc followed by suffix *)
+ name = lookup[calc, Name] <> suffix;
+ calc1 = mapReplace[calc, Name, name];
+ (* Replace some entries in the calculation: each rule lhs -> rhs in updates is turned into a function calling mapReplace[#, lhs, rhs], and the composed functions are applied to calc1 *)
+ (* replaces = Map[Function[rule, mapReplace[#, rule[[1]], rule[[2]]]&], updates]; *)
+ replaces = updates //. (lhs_ -> rhs_) -> (mapReplace[#, lhs, rhs]&);
+ calc2 = Apply[Composition, replaces][calc1];
+ (* Remove unnecessary equations: the filter is built from evolVars joined with the Shorthands entry of calc2 *)
+ vars = Join[evolVars, lookup[calc2, Shorthands]];
+ patterns = Replace[vars, { Tensor[n_,__] -> Tensor[n,__] , (* turn each Tensor[...] entry into a pattern that fixes only its first argument *)
+ dot[Tensor[n_,__]] -> dot[Tensor[n,__]]}, 1];
+ eqs = FilterRules[lookup[calc, Equations], patterns]; (* NOTE(review): Equations are read from the original calc, not calc2, so an Equations entry in updates would not be seen here - confirm this is intended *)
+ calc3 = mapReplace[calc2, Equations, eqs];
+ calc3 (* result: the value of mapReplace[calc2, Equations, eqs] *)
+];
+
+
+
+(******************************************************************************)
+(* Convert from ADMBase *)
+(******************************************************************************)
+
+convertFromADMBaseCalc =
+{
+ Name -> thorn <> "_convertFromADMBase",
+ Schedule -> {"AT initial AFTER ADMBase_PostInitial"},
+ ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+ Shorthands -> {g[la,lb], detg, gu[ua,ub], em4phi},
+ Equations ->
+ {
+ g[la,lb] -> admg[la,lb],
+ detg -> detgExpr,
+ gu[ua,ub] -> 1/detg detgExpr MatrixInverse [g[ua,ub]],
+
+ phi -> IfThen[conformalMethod==CMW, detg^(-1/6), Log[detg]/12],
+ em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+ gt[la,lb] -> em4phi g[la,lb],
+
+ trK -> gu[ua,ub] admK[la,lb],
+ At[la,lb] -> em4phi (admK[la,lb] - (1/3) g[la,lb] trK),
+
+ alpha -> admalpha,
+
+ beta[ua] -> admbeta[ua],
+
+ IfCCZ4[Theta -> 0]
+ }
+};
+
+convertFromADMBaseGammaCalc =
+{
+ Name -> thorn <> "_convertFromADMBaseGamma",
+ Schedule -> {"AT initial AFTER " <> thorn <> "_convertFromADMBase"},
+ ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+ (*
+ Where -> InteriorNoSync,
+ *)
+ (* Do not synchronise right after this routine; instead, synchronise
+ after extrapolating *)
+ Where -> Interior,
+ (* Synchronise after this routine, so that the refinement boundaries
+ are set correctly before extrapolating. (We will need to
+ synchronise again after extrapolating because extrapolation does
+ not fill ghost zones, but this is irrelevant here.) *)
+ Shorthands -> {dir[ua],
+ detgt, gtu[ua,ub], Gt[ua,lb,lc], theta},
+ Equations ->
+ {
+ dir[ua] -> Sign[beta[ua]],
+
+ detgt -> 1 (* detgtExpr *),
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+ Gt[ua,lb,lc] -> 1/2 gtu[ua,ud]
+ (PD[gt[lb,ld],lc] + PD[gt[lc,ld],lb] - PD[gt[lb,lc],ld]),
+ Xt[ua] -> gtu[ub,uc] Gt[ua,lb,lc],
+
+(*
+ A -> - admdtalpha / (harmonicF alpha^harmonicN) (LapseAdvectionCoeff - 1),
+*)
+ (* If LapseACoeff=0, then A is not evolved, in the sense that it
+ does not influence the time evolution of other variables. *)
+ (*A -> IfThen [LapseACoeff != 0,
+ 1 / (- harmonicF alpha^harmonicN)
+ (+ admdtalpha
+ - LapseAdvectionCoeff Upwind[beta[ua], alpha, la]),
+ 0],*)
+
+ theta -> thetaExpr,
+
+ (* If ShiftBCoeff=0 or theta ShiftGammaCoeff=0, then B^i is not
+ evolved, in the sense that it does not influence the time
+ evolution of other variables. *)
+ B[ua] -> IfThen [ShiftGammaCoeff ShiftBCoeff != 0,
+ 1 / (theta ShiftGammaCoeff)
+ (+ admdtbeta[ua]
+ - ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb]),
+ 0]
+ }
+};
+
+(* Initialise the Gamma variables (Xt) and B to 0. This is necessary with
+ multipatch because convertFromADMBaseGamma does not perform the
+ conversion in the boundary points, and the order in which symmetry
+ (interpatch) and outer boundary conditions are applied means that
+ points which are both interpatch and symmetry points are never
+ initialised. *)
+initGammaCalc =
+{
+ Name -> thorn <> "_InitGamma",
+ Schedule -> {"AT initial BEFORE " <> thorn <> "_convertFromADMBaseGamma"},
+ ConditionalOnKeyword -> {"my_initial_data", "ADMBase"},
+ Where -> Everywhere,
+ Equations ->
+ {
+ Xt[ua] -> 0,
+ (*A -> 0, -- disabled together with the other A equations in this file *)
+ B[ua] -> 0
+ }
+};
+
+
+
+(******************************************************************************)
+(* Convert to ADMBase *)
+(******************************************************************************)
+
+convertToADMBaseCalc = (* calculation that writes the adm* quantities from phi, gt, At, trK, alpha and beta *)
+{
+ Name -> thorn <> "_convertToADMBase",
+ Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+ Where -> Everywhere,
+ Shorthands -> {e4phi},
+ Equations ->
+ {
+ e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]], (* conformal factor; its form depends on conformalMethod, compared against the CMW constant *)
+ admg[la,lb] -> e4phi gt[la,lb], (* admg components are aliased to gxx ... gzz earlier in this file *)
+ admK[la,lb] -> e4phi At[la,lb] + (1/3) admg[la,lb] trK, (* uses the admg value assigned on the previous line; admK components are aliased to kxx ... kzz *)
+ admalpha -> alpha, (* admalpha is aliased to alp *)
+ admbeta[ua] -> beta[ua] (* admbeta components are aliased to betax, betay, betaz *)
+ }
+};
+
+convertToADMBaseDtLapseShiftCalc =
+{
+ Name -> thorn <> "_convertToADMBaseDtLapseShift",
+ Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+ ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"},
+ Where -> Interior,
+ Shorthands -> {dir[ua], detgt, gtu[ua,ub], eta, theta, em4phi, Ddetgt[la]},
+ Equations ->
+ {
+ dir[ua] -> Sign[beta[ua]],
+
+ detgt -> 1 (* detgtExpr *),
+ (* This leads to simpler code... *)
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+ em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+
+ eta -> etaExpr,
+ theta -> thetaExpr,
+
+ (* Ddetgt should be zero analytically, but we're not assuming it here. Change commenting to assume it.*)
+ Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la],
+ (*Ddetgt[la] -> 0,*)
+
+ (* see RHS *)
+(*
+ admdtalpha -> - harmonicF alpha^harmonicN
+ ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK)
+ + LapseAdvectionCoeff beta[ua] PDu[alpha,la],
+ admdtalpha -> - harmonicF alpha^harmonicN
+ (+ LapseACoeff A
+ + ((1 - LapseACoeff)
+ (trK - IfCCZ4[2 Theta, 0])))
+ + LapseAdvectionCoeff Upwind[beta[ua], alpha, la],
+*)
+ admdtbeta[ua] -> IfThen[harmonicShift,
+ - 1/2 gtu[ua,uj] em4phi alpha
+ (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj]
+ + 2 PD[alpha,lj]
+ + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])),
+ (* else *)
+ + theta ShiftGammaCoeff
+ (+ ShiftBCoeff B[ua]
+ + (1 - ShiftBCoeff)
+ (Xt[ua] - eta BetaDriver beta[ua]))]
+ + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb]
+ }
+};
+
+convertToADMBaseDtLapseShiftBoundaryCalc =
+{
+ Name -> thorn <> "_convertToADMBaseDtLapseShiftBoundary",
+ Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+ ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"},
+ Where -> BoundaryWithGhosts,
+ Shorthands -> {detgt, gtu[ua,ub], eta, theta},
+ Equations ->
+ {
+ detgt -> 1 (* detgtExpr *),
+ (* This leads to simpler code... *)
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+
+ eta -> etaExpr,
+ theta -> thetaExpr,
+
+ (* see RHS, but omit derivatives near the boundary *)
+(*
+ admdtalpha -> - harmonicF alpha^harmonicN
+ ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK),
+ admdtalpha -> - harmonicF alpha^harmonicN
+ (+ LapseACoeff A
+ + ((1 - LapseACoeff)
+ (trK - IfCCZ4[2 Theta, 0]))),
+*)
+ admdtbeta[ua] -> IfThen[harmonicShift,
+ 0,
+ (* else *)
+ + theta ShiftGammaCoeff
+ (+ ShiftBCoeff B[ua]
+ + (1 - ShiftBCoeff)
+ (Xt[ua] - eta BetaDriver beta[ua]))]
+ }
+};
+
+convertToADMBaseFakeDtLapseShiftCalc =
+{
+ Name -> thorn <> "_convertToADMBaseFakeDtLapseShift",
+ Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"},
+ ConditionalOnKeyword -> {"dt_lapse_shift_method", "noLapseShiftAdvection"},
+ Where -> Everywhere,
+ Shorthands -> {detgt, gtu[ua,ub], eta, theta},
+ Equations ->
+ {
+ detgt -> 1 (* detgtExpr *),
+ (* This leads to simpler code... *)
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+
+ eta -> etaExpr,
+ theta -> thetaExpr,
+
+ (* see RHS, but omit derivatives everywhere (which is wrong, but
+ faster, since it does not require synchronisation or boundary
+ conditions) *)
+(*
+ admdtalpha -> - harmonicF alpha^harmonicN
+ ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK),
+ admdtalpha -> - harmonicF alpha^harmonicN
+ (+ LapseACoeff A
+ + ((1 - LapseACoeff)
+ (trK - IfCCZ4[2 Theta, 0]))),
+*)
+ admdtbeta[ua] -> IfThen[harmonicShift,
+ 0,
+ (* else *)
+ + theta ShiftGammaCoeff
+ (+ ShiftBCoeff B[ua]
+ + (1 - ShiftBCoeff)
+ (Xt[ua] - eta BetaDriver beta[ua]))]
+ }
+};
+
+(******************************************************************************)
+(* Evolution equations *)
+(******************************************************************************)
+
+evolCalc =
+{
+ Name -> thorn <> "_RHS",
+ Schedule -> {"IN " <> thorn <> "_evolCalcGroup"},
+ (*
+ Where -> Interior,
+ *)
+ (* Synchronise the RHS grid functions after this routine, so that
+ the refinement boundaries are set correctly before applying the
+ radiative boundary conditions. *)
+ Where -> InteriorNoSync,
+ Shorthands -> {dir[ua],
+ detgt, gtu[ua,ub],
+ Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], G[ua, lb, lc], ddetg[la], Xtn[ua],
+ Rt[la,lb], Rphi[la,lb], R[la,lb],
+ Atm[ua,lb], Atu[ua,ub],
+ e4phi, em4phi, cdphi[la], cdphi2[la,lb], g[la,lb], detg,
+ gu[ua,ub], Ats[la,lb], trAts, eta, theta,
+ K[la, lb], Km[la, ub],
+ rho, S[la], trS, fac1, fac2, dottrK, dotXt[ua],
+ epsdiss[ua], IfCCZ4[Z[ua]], IfCCZ4[dotTheta], Ddetgt[la]},
+ Equations ->
+ {
+ dir[ua] -> Sign[beta[ua]],
+
+ detgt -> 1 (* detgtExpr *),
+
+ (* This leads to simpler code... *)
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+ Gtl[la,lb,lc] -> 1/2
+ (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]),
+ Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld],
+ Gt[ua,lb,lc] -> gtu[ua,ud] Gtl[ld,lb,lc],
+
+ (* The conformal connection functions calculated from the conformal metric,
+ used instead of Xt where no derivatives of Xt are taken *)
+ Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk],
+
+ e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]],
+ em4phi -> 1 / e4phi,
+ g[la,lb] -> e4phi gt[la,lb],
+ detg -> detgExpr,
+ gu[ua,ub] -> em4phi gtu[ua,ub],
+ ddetg[la] -> 4 detgt e4phi PD[phi,la],
+ G[ua,lb,lc] -> Gt[ua,lb,lc]
+ + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb]
+ - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]),
+ K[la, lb] -> e4phi At[la, lb] + (1/3) g[la, lb] trK,
+ Km[la, ub] -> gu[ub, uc] K[la, lc],
+
+ (* The Z quantities *)
+ (* gr-qc:1106.2254 (2011), eqn. (23) *)
+ IfCCZ4[
+ Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc])
+ ],
+
+ (* PRD 62, 044034 (2000), eqn. (18) *)
+ (* Adding Z term by changing Xtn to Xt *)
+ Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm]
+ + (1/2) gt[lk,li] PD[Xt[uk],lj]
+ + (1/2) gt[lk,lj] PD[Xt[uk],li]
+ + (1/2) Xtn[uk] Gtl[li,lj,lk]
+ + (1/2) Xtn[uk] Gtl[lj,li,lk]
+ + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul]
+ + Gt[uk,lj,ll] Gtlu[li,lk,ul]
+ + Gt[uk,li,ll] Gtlu[lk,lj,ul]),
+
+ fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1],
+ cdphi[la] -> fac1 CDt[phi,la],
+ fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0],
+ cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb],
+
+ (* PRD 62, 044034 (2000), eqn. (15) *)
+ Rphi[li,lj] -> - 2 cdphi2[lj,li]
+ - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln]
+ + 4 cdphi[li] cdphi[lj]
+ - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll],
+
+ Atm[ua,lb] -> gtu[ua,uc] At[lc,lb],
+ Atu[ua,ub] -> Atm[ua,lc] gtu[ub,uc],
+
+ R[la,lb] -> Rt[la,lb] + Rphi[la,lb],
+ IfCCZ4[
+ R[la,lb] -> R[la,lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb]
+ + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc])
+ + e4phi Z[uc] PD[gt[la,lb],lc]
+ ],
+
+ (* Matter terms *)
+
+ (* rho = n^a n^b T_ab *)
+ rho -> addMatter
+ (1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj])),
+
+ (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *)
+ S[li] -> addMatter (-1/alpha (T0[li] - beta[uj] T[li,lj])),
+
+ (* trS = gamma^ij T_ij *)
+ trS -> addMatter (em4phi gtu[ui,uj] T[li,lj]),
+
+ (* RHS terms *)
+
+ (* PRD 62, 044034 (2000), eqn. (10) *)
+ (* PRD 67 084023 (2003), eqn. (16) and (23) *)
+ dot[phi] -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6]
+ (alpha trK - PD[beta[ua],la]),
+ phidot -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6]
+ (alpha trK - PD[beta[ua],la]),
+
+ (* PRD 62, 044034 (2000), eqn. (9) *)
+ (* gr-qc:1106.2254 (2011), eqn. (14) *)
+ (* removing trA from Aij ensures that detg = 1 *)
+ dot[gt[la,lb]] -> - 2 alpha (At[la,lb] - IfCCZ4[(1/3) At[lc,ld] gtu[uc,ud] gt[la,lb], 0])
+ + gt[la,lc] PD[beta[uc],lb] + gt[lb,lc] PD[beta[uc],la]
+ - (2/3) gt[la,lb] PD[beta[uc],lc],
+ (* PRD 62, 044034 (2000), eqn. (20) *)
+ (* PRD 67 084023 (2003), eqn (26) *)
+ (* gr-qc:1106.2254 (2011), eqn. (19) *)
+ (* Adding Z terms by changing Xtn to Xt,
+ also adding extra Z and Theta terms *)
+ dotXt[ui] -> - 2 Atu[ui,uj] PD[alpha,lj]
+ + 2 alpha (+ Gt[ui,lj,lk] Atu[uk,uj]
+ - (2/3) gtu[ui,uj] PD[trK,lj]
+ + 6 Atu[ui,uj] cdphi[lj])
+ + gtu[uj,ul] PD[beta[ui],lj,ll]
+ + (1/3) gtu[ui,uj] PD[beta[ul],lj,ll]
+ - Xtn[uj] PD[beta[ui],lj]
+ + (2/3) Xtn[ui] PD[beta[uj],lj]
+ + IfCCZ4[
+ + GammaShift 2 e4phi (- Z[uj] PD[beta[ui],lj]
+ + (2/3) Z[ui] PD[beta[uj],lj])
+ - (4/3) alpha e4phi Z[ui] trK
+ + 2 gtu[ui,uj] (+ alpha PD[Theta,lj]
+ - Theta PD[alpha,lj])
+ - 2 alpha e4phi dampk1 Z[ui],
+ 0]
+ (* Equation (4.28) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+ + addMatter (- 16 Pi alpha gtu[ui,uj] S[lj]),
+ dot[Xt[ui]] -> dotXt[ui],
+ Xtdot[ui] -> dotXt[ui],
+
+ (* gr-qc:1106.2254 (2011), eqn. (18) *)
+ IfCCZ4[
+ dotTheta ->
+ - PD[alpha,la] Z[ua] - dampk1 (2 + dampk2) alpha Theta
+ + (1/2) alpha (gu[ua,ub] R[la,lb] - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - 2 trK Theta)
+ + addMatter (- 8 Pi alpha rho)
+ ],
+
+ IfCCZ4[
+ dot[Theta] -> dotTheta
+ ],
+
+ (* PRD 62, 044034 (2000), eqn. (11) *)
+ (* gr-qc:1106.2254 (2011), eqn. (17) *)
+ (* Adding the RHS of Theta to K, because K_Z4 = K_BSSN + 2 Theta *)
+ (* Also adding the Z term, as it has to cancel with the one in Theta *)
+ (*dottrK -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb]
+ + 2 cdphi[la] PD[alpha,lb] )
+ - Xtn[ua] PD[alpha,la] )
+ + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2)
+ + IfCCZ4[
+ + 2 dotTheta + 2 PD[alpha,la] Z[ua]
+ + dampk1 (1 - dampk2) alpha Theta,
+ 0]*)
+ term1 -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb]
+ + 2 cdphi[la] PD[alpha,lb] )
+ - Xtn[ua] PD[alpha,la] ),
+ term2 -> + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2),
+ term3 -> IfCCZ4[+ 2 dotTheta + 2 PD[alpha,la] Z[ua]
+ + dampk1 (1 - dampk2) alpha Theta, 0],
+ dottrK -> term1 + term2 + term3
+ (* Equation (4.21) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+ + addMatter (4 Pi alpha (rho + trS)),
+ dot[trK] -> KEvolFactor dottrK,
+
+ (* PRD 62, 044034 (2000), eqn. (12) *)
+ (* TODO: Should we use the Hamiltonian constraint to make Rij tracefree? *)
+ (* gr-qc:1106.2254 (2011), eqn. (15) *)
+ (* Adding Z terms in the Ricci and Theta terms *)
+ Ats[la,lb] -> - CDt[alpha,la,lb] +
+ + 2 (PD[alpha,la] cdphi[lb] + PD[alpha,lb] cdphi[la] )
+ + alpha R[la,lb],
+ trAts -> gu[ua,ub] Ats[la,lb],
+ dot[At[la,lb]] -> + em4phi (+ Ats[la,lb] - (1/3) g[la,lb] trAts )
+ + alpha (+ ((trK - IfCCZ4[2 Theta, 0])
+ At[la,lb])
+ - 2 At[la,lc] Atm[uc,lb])
+ + At[la,lc] PD[beta[uc],lb] + At[lb,lc] PD[beta[uc],la]
+ - (2/3) At[la,lb] PD[beta[uc],lc]
+ (* Equation (4.23) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *)
+ + addMatter (- em4phi alpha 8 Pi
+ (T[la,lb] - (1/3) g[la,lb] trS)),
+ Kdot[la, lb] -> -CD[alpha, la, lb] + alpha (R[la, lb] + trK K[la, lb] - 2 K[la, lc] Km[lb, uc]),
+
+
+ eta -> etaExpr,
+ theta -> thetaExpr,
+
+ (* Ddetgt should be zero analytically, but we're not assuming it here. Change commenting to assume it.*)
+ Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la],
+ (*Ddetgt[la] -> 0,*)
+
+ (* dot[beta[ua]] -> eta Xt[ua], *)
+ (* dot[beta[ua]] -> ShiftGammaCoeff alpha^ShiftAlphaPower B[ua], *)
+ dot[beta[ua]] -> IfThen[harmonicShift,
+ - 1/2 gtu[ua,uj] em4phi alpha
+ (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj]
+ + 2 PD[alpha,lj]
+ + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])),
+ (* else *)
+ + theta ShiftGammaCoeff
+ (+ ShiftBCoeff B[ua]
+ + (1 - ShiftBCoeff)
+ (Xt[ua] - eta BetaDriver beta[ua]))],
+
+ dot[B[ua]] -> + ShiftBCoeff (dotXt[ua] - eta BetaDriver B[ua])
+ (* Note that this dotXt[ua] is not yet \partial_t \tilde \Gamma^i, because the
+ advection term has not yet been added. It is actually
+ \partial_t \tilde \Gamma^i - \beta^j \partial_j \tilde \Gamma^i *)
+ }
+};
+
+lapseEvolCalc = { (* calculation holding the single equation for dot[alpha] *)
+ Name -> thorn <> "_lapse_evol",
+ Schedule -> {"IN " <> thorn <> "_evolCalcGroup"},
+ (*
+ Where -> Interior,
+ *)
+ (* Synchronise the RHS grid functions after this routine, so that
+ the refinement boundaries are set correctly before applying the
+ radiative boundary conditions. NOTE(review): the active setting below is InteriorNoSync while the commented-out one is Interior; confirm which of the two this comment describes. *)
+ Where -> InteriorNoSync,
+ Shorthands -> {},
+ Equations ->
+ {
+ dot[alpha] -> - harmonicF alpha^harmonicN (+ trK - IfCCZ4[2 Theta, 0] + AlphaDriver (alpha - 1)) + WFactor qmsw (* qmsw is aliased to W, listed under the QuasiMaximalSlicing::W extra group *)
+ }
+};
+
+advectCalc = (* calculation that adds Upwind[beta, ...] advection terms onto dot[...] values assigned by earlier calculations *)
+{
+ Name -> thorn <> "_Advect",
+ Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <>
+ "AFTER (" <> thorn <> "_RHS " <> thorn <> "_lapse_evol " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+ (*
+ Where -> Interior,
+ *)
+ (* Synchronise the RHS grid functions after this routine, so that
+ the refinement boundaries are set correctly before applying the
+ radiative boundary conditions. NOTE(review): the active setting below is InteriorNoSync while the commented-out one is Interior; confirm which of the two this comment describes. *)
+ Where -> InteriorNoSync,
+ Shorthands -> {dir[ua]},
+ Equations ->
+ {
+ dir[ua] -> Sign[beta[ua]], (* sign of each shift component; dir appears in the PDupwindNth and PDonesided stencils *)
+
+ dot[phi] -> dot[phi] + Upwind[beta[ua], phi, la],
+
+ dot[gt[la,lb]] -> dot[gt[la,lb]] + Upwind[beta[uc], gt[la,lb], lc],
+
+ dot[Xt[ui]] -> dot[Xt[ui]] + Upwind[beta[uj], Xt[ui], lj],
+
+ IfCCZ4[
+ dot[Theta] -> dot[Theta] + Upwind[beta[ua], Theta, la]
+ ],
+
+ dot[trK] -> dot[trK] + Upwind[beta[ua], trK, la],
+
+ dot[At[la,lb]] -> dot[At[la,lb]] + Upwind[beta[uc], At[la,lb], lc],
+
+ (* disabled: advection terms for alpha and A
+ dot[alpha] -> dot[alpha]
+ + LapseAdvectionCoeff Upwind[beta[ua], alpha, la],
+
+ dot[A] -> dot[A]
+ + LapseACoeff (
+ + LapseAdvectionCoeff Upwind[beta[ua], A, la]
+ + (1 - LapseAdvectionCoeff) Upwind[beta[ua], trK, la]),
+ *)
+
+ dot[beta[ua]] -> dot[beta[ua]]
+ + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb],
+
+ dot[B[ua]] -> dot[B[ua]]
+ + ShiftBCoeff (
+ + ShiftAdvectionCoeff Upwind[beta[ub], B[ua], lb]
+ + ((1 - ShiftAdvectionCoeff)
+ Upwind[beta[ub], Xt[ua], lb]))
+ (* Note that the advection term \beta^j \partial_j \tilde \Gamma^i is not
+ subtracted here when ShiftAdvectionCoeff == 1 because it was
+ implicitly subtracted before (see the comment on dot[B[ua]] in the
+ RHS calculation). *)
+ }
+};
+
+(* First half of evolCalc for the split RHS computation: only the listed
+   left-hand sides are kept, and the calculation is active only when the
+   keyword RHS_calculation is "split".
+   NOTE: evolCalc1 is commented out in the `calculations` list below, so it is
+   currently not generated. *)
+evolCalc1 = PartialCalculation[evolCalc, "1",
+ {
+ ConditionalOnKeyword -> {"RHS_calculation", "split"}
+ },
+ {
+ dot[phi],
+ dot[gt[la,lb]],
+ dot[Xt[ui]],
+ term1, term2, term3,
+ dot[trK],
+ dot[beta[ua]],
+ dot[B[ua]],
+ IfCCZ4[dot[Theta]]
+ }];
+
+(* Second half of the split RHS computation: only dot[At] is kept.
+   NOTE: like evolCalc1, this is commented out in the `calculations` list. *)
+evolCalc2 = PartialCalculation[evolCalc, "2",
+ {
+ ConditionalOnKeyword -> {"RHS_calculation", "split"}
+ },
+ {
+ dot[At[la,lb]]
+ }];
+
+(* Adds a dissipation term epsdiss[ux] PDdiss[var,lx] to the RHS of every
+   variable in the Table below, in a single calculation. Active only when the
+   keyword apply_dissipation is "always". alpha and A are excluded (commented
+   out in the variable list). *)
+dissCalc =
+{
+ Name -> thorn <> "_Dissipation",
+ Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <>
+ "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+ ConditionalOnKeyword -> {"apply_dissipation", "always"},
+ Where -> InteriorNoSync,
+ Shorthands -> {epsdiss[ua]},
+ Equations ->
+ {
+ (* the same strength EpsDiss is used for every direction *)
+ epsdiss[ua] -> EpsDiss,
+ (* Sequence@@ splices the generated rules into the enclosing list *)
+ Sequence@@Table[
+ dot[var] -> dot[var] + epsdiss[ux] PDdiss[var,lx],
+ {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb],
+ (*alpha, A,*) beta[ua], B[ua]}}]
+ }
+};
+
+(* Alternative to dissCalc: one separate dissipation calculation per variable,
+   named <thorn>_Dissipation_<tensor name>.
+   NOTE: not used at the moment -- the `calculations` list joins an empty
+   list in place of dissCalcs. *)
+dissCalcs =
+Table[
+{
+ Name -> thorn <> "_Dissipation_" <> ToString[var /. {Tensor[n_,__] -> n}],
+ Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <>
+ "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"},
+ ConditionalOnKeyword -> {"apply_dissipation", "always"},
+ Where -> InteriorNoSync,
+ Shorthands -> {epsdiss[ua]},
+ Equations ->
+ {
+ epsdiss[ua] -> EpsDiss,
+ dot[var] -> dot[var] + epsdiss[ux] PDdiss[var,lx]
+ }
+},
+ {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb],
+ (*alpha, A,*) beta[ua], B[ua]}}
+];
+
+(* "static" RHS boundary condition: sets the RHS of every evolved variable to
+   zero on the boundary. Active only when my_rhs_boundary_condition is
+   "static". alpha and A are excluded (commented out). *)
+RHSStaticBoundaryCalc =
+{
+ Name -> thorn <> "_RHSStaticBoundary",
+ Schedule -> {"IN MoL_CalcRHS"},
+ ConditionalOnKeyword -> {"my_rhs_boundary_condition", "static"},
+ Where -> Boundary,
+ Equations ->
+ {
+ dot[phi] -> 0,
+ dot[gt[la,lb]] -> 0,
+ dot[trK] -> 0,
+ dot[At[la,lb]] -> 0,
+ dot[Xt[ua]] -> 0,
+ (*dot[alpha] -> 0,
+ dot[A] -> 0,*)
+ dot[beta[ua]] -> 0,
+ dot[B[ua]] -> 0,
+ IfCCZ4[dot[Theta] -> 0]
+ }
+};
+
+(* Initialise the RHS variables in analysis in case they are going to
+ be output - the non-interior points cannot be filled, so we define
+ them to be zero.
+ The calculation runs everywhere (Where -> Everywhere) and is scheduled
+ at analysis, before the evolCalcGroup. alpha and A are excluded. *)
+initRHSCalc =
+{
+ Name -> thorn <> "_InitRHS",
+ Schedule -> {"AT analysis BEFORE " <> thorn <> "_evolCalcGroup"},
+ Where -> Everywhere,
+ Equations ->
+ {
+ dot[phi] -> 0,
+ dot[gt[la,lb]] -> 0,
+ dot[trK] -> 0,
+ dot[At[la,lb]] -> 0,
+ dot[Xt[ua]] -> 0,
+ (*dot[alpha] -> 0,
+ dot[A] -> 0,*)
+ dot[beta[ua]] -> 0,
+ dot[B[ua]] -> 0,
+ IfCCZ4[dot[Theta] -> 0]
+ }
+};
+
+(* "radiative" RHS boundary condition: on the boundary every RHS is set to
+   dot[u] = - v su^c PDo[u, c]
+   where su is the normalised boundary normal with its index raised by the
+   physical inverse metric gu, and v is either vg = Sqrt[harmonicF] (phi, trK,
+   Theta) or 1 (all other variables).
+   NOTE: this calculation is commented out in the `calculations` list, so it
+   is currently not generated even though "radiative" is still an allowed
+   value of my_rhs_boundary_condition. *)
+RHSRadiativeBoundaryCalc =
+{
+ Name -> thorn <> "_RHSRadiativeBoundary",
+ Schedule -> {"IN MoL_CalcRHS"},
+ ConditionalOnKeyword -> {"my_rhs_boundary_condition", "radiative"},
+ Where -> Boundary,
+ Shorthands -> {dir[ua],
+ detgt, gtu[ua,ub], em4phi, gu[ua,ub],
+ nn[la], nu[ua], nlen, nlen2, su[ua],
+ vg},
+ Equations ->
+ {
+ dir[ua] -> Sign[normal[ua]],
+
+ (* det gt is assumed to be 1 instead of being computed *)
+ detgt -> 1 (* detgtExpr *),
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+ em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]],
+ gu[ua,ub] -> em4phi gtu[ua,ub],
+
+ (* boundary normal: lowered with Euc, raised with gu, then normalised *)
+ nn[la] -> Euc[la,lb] normal[ub],
+ nu[ua] -> gu[ua,ub] nn[lb],
+ nlen2 -> nu[ua] nn[la],
+ nlen -> Sqrt[nlen2],
+ su[ua] -> nu[ua] / nlen,
+
+ vg -> Sqrt[harmonicF],
+
+ dot[phi] -> - vg su[uc] PDo[phi ,lc],
+ dot[gt[la,lb]] -> - su[uc] PDo[gt[la,lb],lc],
+ dot[trK] -> - vg su[uc] PDo[trK ,lc],
+ dot[At[la,lb]] -> - su[uc] PDo[At[la,lb],lc],
+ dot[Xt[ua]] -> - su[uc] PDo[Xt[ua] ,lc],
+ (*dot[alpha] -> - vg su[uc] PDo[alpha ,lc],
+ dot[A] -> - vg su[uc] PDo[A ,lc],*)
+ dot[beta[ua]] -> - su[uc] PDo[beta[ua] ,lc],
+ dot[B[ua]] -> - su[uc] PDo[B[ua] ,lc],
+ IfCCZ4[
+ dot[Theta] -> - vg su[uc] PDo[Theta ,lc]
+ ]
+ }
+};
+
+(* Enforces the algebraic constraint tr At = 0 by subtracting the trace part
+   from At: At_ab -> At_ab - (1/3) gt_ab gtu^cd At_cd.
+   The lapse floor (alpha -> Max[alpha, MinimumLapse]) is commented out. *)
+enforceCalc =
+{
+ Name -> thorn <> "_enforce",
+ Schedule -> {"IN MoL_PostStepModify"},
+ Shorthands -> {detgt, gtu[ua,ub], trAt},
+ Equations ->
+ {
+ (* The following comment is still interesting, but is not correct
+ any more since it is now scheduled in MoL_PostStepModify instead:
+
+ Enforcing the constraints needs to be a projection, because it
+ is applied in MoL_PostStep and may thus be applied multiple
+ times, not only during time evolution. Therefore detgt has to
+ be calculated correctly, without assuming that det gt_ij = 1,
+ which is not always the case (since we don't enforce it). On
+ the other hand, this may not be so important... *)
+ detgt -> 1 (* detgtExpr *),
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+
+ trAt -> gtu[ua,ub] At[la,lb],
+
+ At[la,lb] -> At[la,lb] - (1/3) gt[la,lb] trAt(*,
+
+ alpha -> Max[alpha, MinimumLapse]*)
+ }
+};
+
+(******************************************************************************)
+(* Boundary conditions *)
+(******************************************************************************)
+
+(* "Minkowski" boundary condition: overwrites the evolved variables on the
+   boundary (including ghost zones) with flat-space values. phi is 1 for the
+   W method (conformalMethod == CMW) and 0 otherwise; gt is the Kronecker
+   delta; everything else is zero. alpha and A are left untouched. *)
+boundaryCalc =
+{
+ Name -> thorn <> "_boundary",
+ Schedule -> {"IN MoL_PostStep"},
+ ConditionalOnKeyword -> {"my_boundary_condition", "Minkowski"},
+ Where -> BoundaryWithGhosts,
+ Equations ->
+ {
+ phi -> IfThen[conformalMethod==CMW, 1, 0],
+ gt[la,lb] -> KD[la,lb],
+ trK -> 0,
+ At[la,lb] -> 0,
+ Xt[ua] -> 0,
+ (*alpha -> 1,
+ A -> 0,*)
+ beta[ua] -> 0,
+ B[ua] -> 0,
+ IfCCZ4[Theta -> 0]
+ }
+};
+
+(******************************************************************************)
+(* Constraint equations *)
+(******************************************************************************)
+
+(* Computes the constraint monitors on the interior:
+     H       Hamiltonian constraint
+     M[li]   momentum constraint
+     cS      log of det gt
+     cXt[ua] difference between the Gammas computed from gt and the evolved Xt
+     cA      trace of At
+   The rules are evaluated in the order listed, so every shorthand must be
+   assigned before its first use. *)
+constraintsCalc =
+{
+ Name -> thorn <> "_constraints",
+ Schedule -> Automatic,
+ After -> "MoL_PostStep",
+ Where -> Interior,
+ Shorthands -> {detgt, ddetgt[la], gtu[ua,ub], Z[ua],
+ Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], Xtn[ua],
+ e4phi, em4phi,
+ g[la,lb], detg, gu[ua,ub], ddetg[la], G[ua,lb,lc],
+ Rt[la,lb], Rphi[la,lb], R[la,lb], trR, Atm[ua,lb],
+ gK[la,lb,lc], cdphi[la], cdphi2[la,lb],
+ rho, S[la], fac1, fac2},
+ Equations ->
+ {
+ (* det gt and its derivative are assumed to be 1 and 0 respectively;
+    the computed expressions are commented out *)
+ detgt -> 1 (* detgtExpr *),
+ ddetgt[la] -> 0 (* ddetgtExpr[la] *),
+
+ (* This leads to simpler code... *)
+ gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]],
+ Gtl[la,lb,lc] -> 1/2
+ (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]),
+ Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld],
+ Gt[ua,lb,lc] -> gtu[ua,ud] Gtl[ld,lb,lc],
+
+ (* The conformal connection functions calculated from the conformal metric,
+ used instead of Xt where no derivatives of Xt are taken *)
+ Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk],
+
+ e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]],
+ em4phi -> 1 / e4phi,
+ g[la,lb] -> e4phi gt[la,lb],
+ detg -> e4phi^3,
+ gu[ua,ub] -> em4phi gtu[ua,ub],
+
+ (* The Z quantities *)
+ IfCCZ4[
+ Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc])
+ ],
+
+ (* PRD 62, 044034 (2000), eqn. (18) *)
+ Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm]
+ + (1/2) gt[lk,li] PD[Xt[uk],lj]
+ + (1/2) gt[lk,lj] PD[Xt[uk],li]
+ + (1/2) Xtn[uk] Gtl[li,lj,lk]
+ + (1/2) Xtn[uk] Gtl[lj,li,lk]
+ + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul]
+ + Gt[uk,lj,ll] Gtlu[li,lk,ul]
+ + Gt[uk,li,ll] Gtlu[lk,lj,ul]),
+
+ (* From the long turducken paper.
+ This expression seems to give the same result as the one from 044034. *)
+ (* TODO: symmetrise correctly: (ij) = (1/2) [i+j] *)
+(*
+ Rt[li,lj] -> - (1/2) gtu[uk,ul] PD[gt[li,lj],lk,ll]
+ + gt[lk,li] PD[Xt[uk],lj] + gt[lk,lj] PD[Xt[uk],li]
+ + gt[li,ln] Gt[un,lj,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm]
+ + gt[lj,ln] Gt[un,li,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm]
+ + gtu[ul,us] (+ 2 Gt[uk,ll,li] gt[lj,ln] Gt[un,lk,ls]
+ + 2 Gt[uk,ll,lj] gt[li,ln] Gt[un,lk,ls]
+ + Gt[uk,li,ls] gt[lk,ln] Gt[un,ll,lj]),
+*)
+
+ (* Below would be a straightforward calculation,
+ without taking any Gamma^i into account.
+ This expression gives a different answer! *)
+(*
+ Rt[la,lb] -> + Gt[u1,l2,la] Gt[l1,lb,u2] - Gt[u1,la,lb] Gt[l1,l2,u2]
+ + 1/2 gtu[u1,u2] (- PD[gt[l1,l2],la,lb] + PD[gt[l1,la],l2,lb]
+ - PD[gt[la,lb],l1,l2] + PD[gt[l2,lb],l1,la]),
+*)
+
+ (* fac1 and fac2 convert derivatives of the evolved conformal variable
+    into derivatives of phi for the W method; for the phi method they
+    reduce to 1 and 0 *)
+ fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1],
+ cdphi[la] -> fac1 CDt[phi,la],
+ fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0],
+ cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb],
+
+ (* PRD 62, 044034 (2000), eqn. (15) *)
+ Rphi[li,lj] -> - 2 cdphi2[lj,li]
+ - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln]
+ + 4 cdphi[li] cdphi[lj]
+ - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll],
+
+ (* ddetg[la] -> PD[e4phi detg,la], *)
+ ddetg[la] -> e4phi ddetgt[la] + 4 detgt e4phi PD[phi,la],
+ (* TODO: check this equation, maybe simplify it by omitting ddetg *)
+ G[ua,lb,lc] -> Gt[ua,lb,lc]
+ + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb]
+ - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]),
+
+ R[la,lb] -> + Rt[la,lb] + Rphi[la,lb],
+
+ IfCCZ4[
+ R[la,lb] -> R[la, lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb]
+ + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc])
+ + e4phi Z[uc] PD[gt[la,lb],lc]
+ ],
+
+ trR -> gu[ua,ub] R[la,lb],
+
+ (* K[la,lb] -> e4phi At[la,lb] + (1/3) g[la,lb] trK, *)
+ (* Km[ua,lb] -> gu[ua,uc] K[lc,lb], *)
+ Atm[ua,lb] -> gtu[ua,uc] At[lc,lb],
+
+ (* Matter terms *)
+
+ (* rho = n^a n^b T_ab *)
+ rho -> 1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj]),
+
+ (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *)
+ S[li] -> -1/alpha (T0[li] - beta[uj] T[li,lj]),
+
+ (* Constraints *)
+
+ (* H -> trR - Km[ua,lb] Km[ub,la] + trK^2, *)
+ (* PRD 67, 084023 (2003), eqn. (19) *)
+ H -> trR - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - addMatter 16 Pi rho,
+
+ (* gK[la,lb,lc] -> CD[K[la,lb],lc], *)
+(* gK[la,lb,lc] -> + 4 e4phi PD[phi,lc] At[la,lb] + e4phi CD[At[la,lb],lc]
+ + (1/3) g[la,lb] PD[trK,lc],
+
+ M[la] -> gu[ub,uc] (gK[lc,la,lb] - gK[lc,lb,la]), *)
+
+ M[li] -> + gtu[uj,uk] (CDt[At[li,lj],lk] + 6 At[li,lj] cdphi[lk])
+ - (2/3) PD[trK,li]
+ - addMatter 8 Pi S[li],
+ (* TODO: use PRD 67, 084023 (2003), eqn. (20) *)
+
+ (* det gamma-tilde *)
+ (* NOTE(review): detgt is hard-wired to 1 at the top of this calculation,
+    so cS = Log[1] is identically zero and does not monitor det gt.
+    Restoring detgtExpr above would be needed to make cS meaningful. *)
+ cS -> Log[detgt],
+
+ (* Gamma constraint *)
+ cXt[ua] -> gtu[ub,uc] Gt[ua,lb,lc] - Xt[ua],
+
+ (* trace A-tilde *)
+ cA -> gtu[ua,ub] At[la,lb]
+ }
+};
+
+(* Split versions of constraintsCalc: part 1 keeps only the Hamiltonian
+   constraint H, part 2 keeps the momentum constraint and the algebraic
+   monitors. Both are commented out in the `calculations` list; the unsplit
+   constraintsCalc is used instead. *)
+constraintsCalc1 = PartialCalculation[constraintsCalc, "1",
+ {},
+ {
+ H
+ }];
+
+constraintsCalc2 = PartialCalculation[constraintsCalc, "2",
+ {},
+ {
+ M[li],
+ cS,
+ cXt[ua],
+ cA
+ }];
+
+(******************************************************************************)
+(* Implementations *)
+(******************************************************************************)
+
+(* Implementations the generated thorn inherits from. TmunuBase is added
+   only when the thorn is built with matter terms (addMatter != 0). *)
+inheritedImplementations =
+ Join[{"ADMBase", "QuasiMaximalSlicing"},
+ If [addMatter!=0, {"TmunuBase"}, {}]];
+
+(******************************************************************************)
+(* Parameters *)
+(******************************************************************************)
+
+(* no keyword parameters are inherited from other thorns *)
+inheritedKeywordParameters = {};
+
+(* Extends the ADMBase evolution-method keywords with one additional allowed
+   value, the name of this thorn, so that parameter files can select it for
+   the metric, lapse, shift and their time derivatives. *)
+extendedKeywordParameters =
+{
+ {
+ Name -> "ADMBase::evolution_method",
+ AllowedValues -> {thorn}
+ },
+ {
+ Name -> "ADMBase::lapse_evolution_method",
+ AllowedValues -> {thorn}
+ },
+ {
+ Name -> "ADMBase::shift_evolution_method",
+ AllowedValues -> {thorn}
+ },
+ {
+ Name -> "ADMBase::dtlapse_evolution_method",
+ AllowedValues -> {thorn}
+ },
+ {
+ Name -> "ADMBase::dtshift_evolution_method",
+ AllowedValues -> {thorn}
+ }
+};
+
+(* Keyword (string-valued) parameters declared by the generated thorn. The
+   calculations in this file select on them via ConditionalOnKeyword. *)
+keywordParameters =
+{
+ {
+ Name -> "my_initial_data",
+ (* Visibility -> "restricted", *)
+ (* Description -> "ddd", *)
+ AllowedValues -> {"ADMBase", "Minkowski"},
+ Default -> "ADMBase"
+ },
+ {
+ Name -> "my_initial_boundary_condition",
+ Visibility -> "restricted",
+ (* Description -> "ddd", *)
+ AllowedValues -> {"none"},
+ Default -> "none"
+ },
+ {
+ (* NOTE(review): "radiative" is accepted here, but the calculation that
+    implements it (RHSRadiativeBoundaryCalc) is commented out in the
+    `calculations` list, so selecting it currently has no effect. *)
+ Name -> "my_rhs_boundary_condition",
+ Visibility -> "restricted",
+ (* Description -> "ddd", *)
+ AllowedValues -> {"none", "static", "radiative"},
+ Default -> "none"
+ },
+ {
+ Name -> "my_boundary_condition",
+ (* Visibility -> "restricted", *)
+ (* Description -> "ddd", *)
+ AllowedValues -> {"none", "Minkowski"},
+ Default -> "none"
+ },
+ {
+ Name -> "calculate_ADMBase_variables_at",
+ Visibility -> "restricted",
+ (* Description -> "ddd", *)
+ AllowedValues -> {"MoL_PostStep", "CCTK_EVOL", "CCTK_ANALYSIS"},
+ Default -> "MoL_PostStep"
+ },
+ {
+ Name -> "UseSpatialBetaDriver",
+ Visibility -> "restricted",
+ (* Description -> "ddd", *)
+ AllowedValues -> {"no", "yes"},
+ Default -> "no"
+ },
+ {
+ Name -> "dt_lapse_shift_method",
+ Description -> "Treatment of ADMBase dtlapse and dtshift",
+ AllowedValues -> {"correct",
+ "noLapseShiftAdvection" (* omit lapse and shift advection terms (faster) *)
+ },
+ Default -> "correct"
+ },
+ {
+ Name -> "apply_dissipation",
+ Description -> "Whether to apply dissipation to the RHSs",
+ AllowedValues -> {"always",
+ "never" (* yes and no keyword values confuse Cactus, and Kranc
+ doesn't support boolean parameters *)
+ },
+ Default -> "never"
+ }
+
+};
+
+(* Integer parameters declared by the generated thorn. *)
+intParameters =
+{
+ {
+ (* exponent n of the lapse in the lapse evolution equation *)
+ Name -> harmonicN,
+ Description -> "d/dt alpha = - f alpha^n K (harmonic=2, 1+log=1)",
+ Default -> 2
+ },
+ {
+ Name -> ShiftAlphaPower,
+ Default -> 0
+ },
+ {
+ (* compared against CMW in the calculations to switch between the
+    phi and W forms of the conformal factor *)
+ Name -> conformalMethod,
+ Description -> "Treatment of conformal factor",
+ AllowedValues -> {{Value -> "0", Description -> "phi method"},
+ {Value -> "1", Description -> "W method"}},
+ Default -> 0
+ },
+ {
+ (* run-time finite-difference order; defaults to the order the code
+    was generated with *)
+ Name -> fdOrder,
+ Default -> derivOrder,
+ AllowedValues -> {2,4,6,8}
+ },
+ {
+ Name -> harmonicShift,
+ Description -> "Whether to use the harmonic shift",
+ AllowedValues -> {{Value -> "0", Description -> "Gamma driver shift"},
+ {Value -> "1", Description -> "Harmonic shift"}},
+ Default -> 0
+ }
+};
+
+(* Real-valued parameters declared by the generated thorn. The three CCZ4
+   parameters are only emitted for the CCZ4 formulation (IfCCZ4).
+   Range strings use the Cactus param.ccl syntax: "(0:*" is the open interval
+   x > 0, "0:*" is the closed interval x >= 0. *)
+realParameters =
+{
+ IfCCZ4[{
+ Name -> GammaShift,
+ Description -> "Covariant shift term in Gamma",
+ Default -> 0.5
+ }],
+ IfCCZ4[{
+ Name -> dampk1,
+ Description -> "CCZ4 damping term 1 for Theta and Z",
+ Default -> 0
+ }],
+ IfCCZ4[{
+ Name -> dampk2,
+ Description -> "CCZ4 damping term 2 for Theta and Z",
+ Default -> 0
+ }],
+ {
+ Name -> LapseACoeff,
+ Description -> "Whether to evolve A in time",
+ Default -> 0
+ },
+ {
+ Name -> harmonicF,
+ Description -> "d/dt alpha = - f alpha^n K (harmonic=1, 1+log=2)",
+ Default -> 1
+ },
+ {
+ Name -> AlphaDriver,
+ Default -> 0
+ },
+ {
+ Name -> RDriver,
+ Default -> 1
+ },
+ {
+ Name -> ShiftBCoeff,
+ Description -> "Whether to evolve B^i in time",
+ Default -> 1
+ },
+ {
+ Name -> ShiftGammaCoeff,
+ Default -> 0
+ },
+ {
+ Name -> BetaDriver,
+ Default -> 0
+ },
+ {
+ (* multiplies qmsw in the lapse evolution equation *)
+ Name -> WFactor,
+ Default -> 1
+ },
+ {
+ Name -> LapseAdvectionCoeff,
+ Description -> "Factor in front of the lapse advection terms in 1+log",
+ Default -> 1
+ },
+ {
+ Name -> ShiftAdvectionCoeff,
+ Description -> "Factor in front of the shift advection terms in gamma driver",
+ Default -> 1
+ },
+ {
+ Name -> MinimumLapse,
+ Description -> "Minimum value of the lapse function",
+ Default -> -1
+ },
+ {
+ Name -> SpatialBetaDriverRadius,
+ Description -> "Radius at which the BetaDriver starts to be reduced",
+ AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}},
+ Default -> 10^12
+ },
+ {
+ Name -> SpatialShiftGammaCoeffRadius,
+ Description -> "Radius at which the ShiftGammaCoefficient starts to be reduced",
+ AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}},
+ Default -> 10^12
+ },
+ {
+ (* The range is closed at 0: the default 0 (no dissipation) must itself
+    be an allowed value, otherwise it cannot be set explicitly in a
+    parameter file. *)
+ Name -> EpsDiss,
+ Description -> "Dissipation strength",
+ AllowedValues -> {{Value -> "0:*", Description -> "Non-negative"}},
+ Default -> 0
+ },
+ {
+ (* same reasoning as for EpsDiss: the default 0 has to lie in the range *)
+ Name -> KEvolFactor,
+ Description -> "",
+ AllowedValues -> {{Value -> "0:*", Description -> "Non-negative"}},
+ Default -> 0
+ }
+};
+
+(******************************************************************************)
+(* Construct the thorns *)
+(******************************************************************************)
+
+(* The list of calculations handed to Kranc. Entries inside comments are
+   defined in this file but deliberately left out of the generated thorn:
+   the split RHS (evolCalc1/2), the radiative RHS boundary condition, the
+   split constraints and the per-variable dissipation calculations. *)
+calculations =
+Join[
+{
+ initialCalc,
+ convertFromADMBaseCalc,
+ initGammaCalc,
+ convertFromADMBaseGammaCalc,
+ evolCalc,
+ (*evolCalc1, evolCalc2,*)
+ lapseEvolCalc,
+ dissCalc,
+ advectCalc,
+ initRHSCalc,
+ (* evol1Calc, evol2Calc, *)
+ RHSStaticBoundaryCalc,
+ (* RHSRadiativeBoundaryCalc, *)
+ enforceCalc,
+ boundaryCalc,
+ convertToADMBaseCalc,
+ convertToADMBaseDtLapseShiftCalc,
+ convertToADMBaseDtLapseShiftBoundaryCalc,
+ convertToADMBaseFakeDtLapseShiftCalc,
+ constraintsCalc
+ (*constraintsCalc1, constraintsCalc2*)
+},
+ {} (*dissCalcs*)
+];
+
+(* Generate the thorn in the current directory (".") from the calculation,
+   group, derivative and parameter definitions assembled above.
+   NOTE(review): UseJacobian is hard-wired to True here and does not depend
+   on the useJacobian argument described in the Options section below --
+   confirm this is intended. *)
+CreateKrancThornTT [groups, ".", thorn,
+ Calculations -> calculations,
+ DeclaredGroups -> declaredGroupNames,
+ PartialDerivatives -> derivatives,
+ EvolutionTimelevels -> evolutionTimelevels,
+ DefaultEvolutionTimelevels -> 3,
+ UseJacobian -> True,
+ UseLoopControl -> True,
+ UseVectors -> True,
+ InheritedImplementations -> inheritedImplementations,
+ InheritedKeywordParameters -> inheritedKeywordParameters,
+ ExtendedKeywordParameters -> extendedKeywordParameters,
+ KeywordParameters -> keywordParameters,
+ IntParameters -> intParameters,
+ RealParameters -> realParameters
+];
+
+];
+
+
+
+(******************************************************************************)
+(* Options *)
+(******************************************************************************)
+
+(* These are the arguments to createCode, in order:
+ - derivative order: 2, 4, 6, 8, ...
+ - useJacobian: False or True
+ - split upwind derivatives: False or True
+ - timelevels: 2 or 3
+ (keep this at 3; this is better chosen with a run-time parameter)
+ - matter: 0 or 1
+ (matter seems cheap; it should be always enabled)
+ - thorn base name
+*)
+
+(* 4th order, no Jacobian, split upwind derivatives, 3 timelevels,
+   matter enabled, thorn base name "MD" *)
+createCode[4, False, True , 3, 1, "MD"];
diff --git a/src/basis.c b/src/basis.c
new file mode 100644
index 0000000..8e5bdcc
--- /dev/null
+++ b/src/basis.c
@@ -0,0 +1,281 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <math.h>
+
+#include "basis.h"
+#include "common.h"
+
+/* Table of callbacks implementing one family of basis functions.
+ * Every callback receives the context, which carries the scaling factor. */
+typedef struct BasisSet {
+ /* evaluate the idx-th basis function at the specified point */
+ double (*eval) (const MDBasisSetContext *s, double coord, unsigned int idx);
+ /* evaluate the first derivative of the idx-th basis function at the specified point */
+ double (*eval_diff1)(const MDBasisSetContext *s, double coord, unsigned int idx);
+ /* evaluate the second derivative of the idx-th basis function at the specified point */
+ double (*eval_diff2)(const MDBasisSetContext *s, double coord, unsigned int idx);
+ /**
+ * Get the idx-th collocation point for the specified order.
+ * idx runs from 0 to order - 1 (inclusive)
+ */
+ double (*colloc_point)(const MDBasisSetContext *s, unsigned int order, unsigned int idx);
+} BasisSet;
+
+/* State of one basis set instance, created by md_basis_init(). */
+struct MDBasisSetContext {
+ /* callbacks of the selected basis family */
+ const BasisSet *bs;
+ /* scaling factor: the length L in the SB/TB definitions (unused by the cosine basis) */
+ double sf;
+};
+
+/*
+ * Even-numbered SB functions (Boyd 2000, Ch. 17.9), n = 2 * idx:
+ *     SB(x, n) = sin((n + 1) arccot(x / L)),    L = s->sf
+ * They are symmetric with respect to the origin and decay as 1/x at
+ * infinity. arccot(x / L) is computed as atan2(L, x).
+ */
+static double sb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int n     = 2 * idx; // even only
+    const double       theta = atan2(s->sf, coord);
+
+    return sin((n + 1) * theta);
+}
+
+/* First derivative with respect to coord of the idx-th even SB function. */
+static double sb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k     = 2 * idx + 1; // n + 1 with n = 2 * idx (even only)
+    const double       theta = atan2(s->sf, coord);
+
+    /* d(theta)/d(coord) = -sf / (sf^2 + coord^2) */
+    return -s->sf * k * cos(k * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+/* Second derivative with respect to coord of the idx-th even SB function. */
+static double sb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k     = 2 * idx + 1; // n + 1 with n = 2 * idx (even only)
+    const double       sf    = s->sf;
+    const double       theta = atan2(sf, coord);
+
+    return sf * k * (2 * coord * cos(k * theta) - sf * k * sin(k * theta)) / SQR(SQR(sf) + SQR(coord));
+}
+
+/*
+ * idx-th collocation point of the even SB basis of the given order:
+ *     x = sf / tan(t),    t = (order - idx + 1) * pi / D
+ * with D = 2 * order + 3 when MD_POLAR is set and 2 * order + 2 otherwise.
+ * The index is reversed so that the returned coordinate grows with idx.
+ */
+static double sb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+#if MD_POLAR
+    const unsigned int denom = 2 * order + 3;
+#else
+    const unsigned int denom = 2 * order + 2;
+#endif
+    const unsigned int rev = order - idx - 1;
+    const double       t   = (rev + 2) * M_PI / denom;
+
+    return s->sf / tan(t);
+}
+
+/* callback table for the even SB functions, selected by MD_BASIS_FAMILY_SB_EVEN */
+static const BasisSet sb_even_basis = {
+ .eval = sb_even_eval,
+ .eval_diff1 = sb_even_eval_diff1,
+ .eval_diff2 = sb_even_eval_diff2,
+ .colloc_point = sb_even_colloc_point,
+};
+
+/*
+ * Odd-numbered SB functions, n = 2 * idx + 1:
+ *     SB(x, n) = sin((n + 1) arccot(x / L)),    L = s->sf
+ */
+static double sb_odd_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k     = 2 * idx + 2; // n + 1 for odd n only
+    const double       theta = atan2(s->sf, coord);
+
+    return sin(k * theta);
+}
+
+/* First derivative with respect to coord of the idx-th odd SB function. */
+static double sb_odd_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k     = 2 * idx + 2; // n + 1 for odd n only
+    const double       theta = atan2(s->sf, coord);
+
+    return -s->sf * k * cos(k * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+/* Second derivative with respect to coord of the idx-th odd SB function. */
+static double sb_odd_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k     = 2 * idx + 2; // n + 1 for odd n only
+    const double       sf    = s->sf;
+    const double       theta = atan2(sf, coord);
+
+    return sf * k * (2 * coord * cos(k * theta) - sf * k * sin(k * theta)) / SQR(SQR(sf) + SQR(coord));
+}
+
+/*
+ * idx-th collocation point of the odd SB basis of the given order:
+ *     x = sf / tan(t),    t = (order - idx + 1) * pi / (2 * order + 3)
+ * The index is reversed so that the returned coordinate grows with idx.
+ *
+ * NOTE(review): unlike the even SB basis, the polar (MD_POLAR) and non-polar
+ * builds use the same denominator here, so no preprocessor switch is needed.
+ * Confirm that the non-polar case was not meant to differ.
+ */
+static double sb_odd_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    const unsigned int rev = order - idx - 1;
+    const double       t   = (rev + 2) * M_PI / (2 * order + 3);
+
+    return s->sf / tan(t);
+}
+
+/* callback table for the odd SB functions, selected by MD_BASIS_FAMILY_SB_ODD */
+static const BasisSet sb_odd_basis = {
+ .eval = sb_odd_eval,
+ .eval_diff1 = sb_odd_eval_diff1,
+ .eval_diff2 = sb_odd_eval_diff2,
+ .colloc_point = sb_odd_colloc_point,
+};
+
+/*
+ * Even TB-type functions with the constant subtracted, n = 2 * (idx + 1):
+ *     f(x) = cos(n arccot(|x| / L)) - 1,    L = s->sf
+ * The origin is special-cased to avoid dividing by zero: arccot(0) = pi / 2.
+ */
+static double tb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int n = 2 * (idx + 1); // even only
+    double theta;
+
+    if (coord == 0.0)
+        theta = M_PI_2;
+    else
+        theta = atan(s->sf / fabs(coord));
+
+    return cos(n * theta) - 1.0;
+}
+
+/*
+ * First derivative with respect to coord of the idx-th even TB function.
+ * The function depends on |coord|, hence the SGN(coord) factor.
+ */
+static double tb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int n = 2 * (idx + 1); // even only
+    double theta;
+
+    if (coord == 0.0)
+        theta = M_PI_2;
+    else
+        theta = atan(s->sf / fabs(coord));
+
+    return s->sf * n * SGN(coord) * sin(n * theta) / (SQR(s->sf) + SQR(coord));
+}
+
+/*
+ * Second derivative with respect to coord of the idx-th even TB function
+ *     f(x) = cos(n t) - 1,    t = arccot(|x| / L),    n = 2 * (idx + 1).
+ *
+ * With dt/dx = -L sgn(x) / (L^2 + x^2) one gets
+ *     f'(x)  =  n L sgn(x) sin(n t) / (L^2 + x^2)
+ *     f''(x) = -n L (2 |x| sin(n t) + n L cos(n t)) / (L^2 + x^2)^2
+ * The two sgn(x) factors multiply to 1, so f'' carries no sign factor: the
+ * second derivative of an even function is itself even. Keeping a sgn(x)
+ * factor here would flip the sign of the result for negative coord.
+ */
+static double tb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const double sf  = s->sf;
+    const double val = (coord == 0.0) ? M_PI_2 : atan(sf / fabs(coord));
+
+    idx++;
+    idx *= 2; // even only
+
+    return -sf * idx * (2 * fabs(coord) * sin(idx * val) + sf * idx * cos(idx * val)) / SQR(SQR(sf) + SQR(coord));
+}
+
+/*
+ * idx-th collocation point of the even TB basis of the given order:
+ *     x = sf / tan(t),    t = (order - idx + 1) * pi / (2 * order + 4)
+ * The index is reversed so that the returned coordinate grows with idx.
+ */
+static double tb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    const unsigned int rev = order - idx - 1;
+    const double       t   = (rev + 2) * M_PI / (2 * order + 4);
+
+    return s->sf / tan(t);
+}
+
+/* callback table for the even TB functions, selected by MD_BASIS_FAMILY_TB_EVEN */
+static const BasisSet tb_even_basis = {
+ .eval = tb_even_eval,
+ .eval_diff1 = tb_even_eval_diff1,
+ .eval_diff2 = tb_even_eval_diff2,
+ .colloc_point = tb_even_colloc_point,
+};
+
+/* Even cosines: f(x) = cos(2 * idx * x). The context is not used. */
+static double cos_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const unsigned int k = 2 * idx;
+
+    return cos(k * coord);
+}
+
+/*
+ * First derivative of cos(2 * idx * x): -2 * idx * sin(2 * idx * x).
+ *
+ * The prefactor is computed in double. In the expression -2 * idx the
+ * int constant would be converted to unsigned int, so the product wraps
+ * around to a huge positive value instead of being negative.
+ */
+static double cos_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const double k = 2.0 * idx;
+
+    return -k * sin(k * coord);
+}
+
+/*
+ * Second derivative of cos(2 * idx * x): -4 * idx^2 * cos(2 * idx * x).
+ *
+ * The prefactor is computed in double. Multiplying the int constant -4 by
+ * an unsigned int square would be evaluated in unsigned arithmetic and wrap
+ * around to a huge positive value instead of being negative.
+ */
+static double cos_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx)
+{
+    const double k = 2.0 * idx;
+
+    return -k * k * cos(k * coord);
+}
+
+/*
+ * idx-th collocation point of the even cosine basis: pi * idx / (2 * order),
+ * i.e. equidistant points starting at 0. The context is not used.
+ */
+static double cos_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx)
+{
+    const unsigned int denom = 2 * order;
+
+    return M_PI * idx / denom;
+}
+
+/* callback table for the even cosines, selected by MD_BASIS_FAMILY_COS_EVEN */
+static const BasisSet cos_even_basis = {
+ .eval = cos_even_eval,
+ .eval_diff1 = cos_even_eval_diff1,
+ .eval_diff2 = cos_even_eval_diff2,
+ .colloc_point = cos_even_colloc_point,
+};
+
+/**
+ * Evaluate a basis function or one of its derivatives.
+ *
+ * @param s     basis set context created by md_basis_init()
+ * @param type  what to evaluate: the value, the first or the second derivative
+ * @param coord point at which to evaluate
+ * @param order index of the basis function (passed to the callbacks as idx)
+ *
+ * @return the requested value; NAN if type is not a valid MDBasisEvalType,
+ *         instead of calling through a NULL function pointer
+ */
+double md_basis_eval(const MDBasisSetContext *s, enum MDBasisEvalType type,
+                     double coord, unsigned int order)
+{
+    switch (type) {
+    case MD_BASIS_EVAL_TYPE_VALUE: return s->bs->eval      (s, coord, order);
+    case MD_BASIS_EVAL_TYPE_DIFF1: return s->bs->eval_diff1(s, coord, order);
+    case MD_BASIS_EVAL_TYPE_DIFF2: return s->bs->eval_diff2(s, coord, order);
+    }
+
+    /* only reached for an out-of-range type value */
+    return NAN;
+}
+
+/**
+ * Get the idx-th collocation point of the basis for the specified order by
+ * forwarding to the colloc_point callback of the selected family.
+ * idx runs from 0 to order - 1 (inclusive).
+ */
+double md_basis_colloc_point(const MDBasisSetContext *s, unsigned int order,
+ unsigned int idx)
+{
+ return s->bs->colloc_point(s, order, idx);
+}
+
+/**
+ * Free a context created by md_basis_init() and reset the caller's pointer
+ * to NULL. Does nothing when *pctx is already NULL.
+ */
+void md_basis_free(MDBasisSetContext **pctx)
+{
+    if (*pctx) {
+        free(*pctx);
+        *pctx = NULL;
+    }
+}
+
+/**
+ * Allocate and set up a basis set context.
+ *
+ * @param pctx   on success, the newly allocated context is written here;
+ *               left untouched on failure
+ * @param family basis family to use
+ * @param sf     scaling factor stored in the context
+ *
+ * @return 0 on success, -ENOMEM if the allocation fails, -EINVAL if family
+ *         is not a known basis family
+ */
+int md_basis_init(MDBasisSetContext **pctx, enum MDBasisFamily family, double sf)
+{
+    MDBasisSetContext *bsc = calloc(1, sizeof(*bsc));
+
+    if (!bsc)
+        return -ENOMEM;
+
+    bsc->sf = sf;
+
+    if (family == MD_BASIS_FAMILY_TB_EVEN) {
+        bsc->bs = &tb_even_basis;
+    } else if (family == MD_BASIS_FAMILY_SB_EVEN) {
+        bsc->bs = &sb_even_basis;
+    } else if (family == MD_BASIS_FAMILY_SB_ODD) {
+        bsc->bs = &sb_odd_basis;
+    } else if (family == MD_BASIS_FAMILY_COS_EVEN) {
+        bsc->bs = &cos_even_basis;
+    } else {
+        /* unknown family: release the context again */
+        free(bsc);
+        return -EINVAL;
+    }
+
+    *pctx = bsc;
+    return 0;
+}
diff --git a/src/basis.h b/src/basis.h
new file mode 100644
index 0000000..08f23ee
--- /dev/null
+++ b/src/basis.h
@@ -0,0 +1,45 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_BASIS_H
+#define MD_BASIS_H
+
+/* Selects what md_basis_eval() computes. */
+enum MDBasisEvalType {
+ MD_BASIS_EVAL_TYPE_VALUE, /* value of the basis function */
+ MD_BASIS_EVAL_TYPE_DIFF1, /* first derivative */
+ MD_BASIS_EVAL_TYPE_DIFF2, /* second derivative */
+};
+
+/* Available families of basis functions, selected in md_basis_init(). */
+enum MDBasisFamily {
+ MD_BASIS_FAMILY_TB_EVEN, /* even TB functions */
+ MD_BASIS_FAMILY_SB_EVEN, /* even SB functions */
+ MD_BASIS_FAMILY_SB_ODD, /* odd SB functions */
+ MD_BASIS_FAMILY_COS_EVEN, /* even cosines */
+};
+
+/* Opaque basis set context; allocate with md_basis_init(), release with md_basis_free(). */
+typedef struct MDBasisSetContext MDBasisSetContext;
+
+/*
+ * md_basis_init(): allocate a context for the given family and scaling
+ * factor sf and store it in *ctx. Returns 0 on success, a negative errno
+ * value on failure.
+ * md_basis_free(): free the context and set *ctx to NULL.
+ */
+int md_basis_init(MDBasisSetContext **ctx, enum MDBasisFamily family, double sf);
+void md_basis_free(MDBasisSetContext **ctx);
+
+/*
+ * md_basis_eval(): evaluate the basis function with index `order`, or one of
+ * its derivatives as selected by type, at coord.
+ * md_basis_colloc_point(): get the idx-th collocation point for the given
+ * order.
+ */
+double md_basis_eval(const MDBasisSetContext *ctx, enum MDBasisEvalType type,
+ double coord, unsigned int order);
+double md_basis_colloc_point(const MDBasisSetContext *ctx, unsigned int order,
+ unsigned int idx);
+
+#endif /* MD_BASIS_H */
diff --git a/src/bicgstab.c b/src/bicgstab.c
new file mode 100644
index 0000000..7e82183
--- /dev/null
+++ b/src/bicgstab.c
@@ -0,0 +1,410 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include <cblas.h>
+#include <errno.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bicgstab.h"
+
+#define BICGSTAB_MAXITER 16
+#define BICGSTAB_TOL (1e-15)
+
+struct BiCGStabContext { /* state of one BiCGStab solver for an NxN system */
+ int N; /* system dimension: vectors hold N doubles, matrices N*N */
+
+ double *x; /* solution estimate: seeded by md_bicgstab_init(), updated in place by solve_sw() */
+ double *p, *v, *y, *z, *t; /* length-N work vectors of the iteration */
+ double *res, *res0; /* current residual and the copy of the initial residual */
+ double *k; /* N*N preconditioner matrix copied in by md_bicgstab_init() */
+
+#if HAVE_OPENCL
+ cl_context ocl_ctx; /* non-NULL selects the OpenCL path in md_bicgstab_solve() */
+ cl_command_queue ocl_queue; /* queue on which all transfers and clBLAS calls are enqueued */
+
+ cl_mem cl_x; /* device-side counterparts of the host buffers above */
+ cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+ cl_mem cl_res, cl_res0;
+ cl_mem cl_k, cl_mat; /* cl_mat receives the system matrix on every solve_cl() call */
+ cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; /* scalar results; cl_omega holds two doubles; cl_beta and cl_omega1 are not referenced by solve_cl() */
+ cl_mem cl_tmp, cl_tmp1; /* scratch buffers handed to the clBLAS reductions */
+#endif
+};
+
+#if HAVE_OPENCL
+/*
+ * Preconditioned BiCGStab executed on an OpenCL device through clBLAS.
+ *
+ * The system matrix and the right-hand side are uploaded on every call; the
+ * preconditioner (cl_k) and the starting guess (cl_x) must have been uploaded
+ * by md_bicgstab_init(). cl_x is updated in place and read back into x once
+ * the iteration has converged.
+ *
+ * Returns the zero-based index of the iteration at which the relative
+ * residual dropped below BICGSTAB_TOL, or -1 if that did not happen within
+ * BICGSTAB_MAXITER iterations (x is left untouched in that case).
+ *
+ * NOTE(review): return codes of the OpenCL/clBLAS calls are not checked and
+ * the created events are never released.
+ */
+static int solve_cl(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    cl_command_queue ocl_q = ctx->ocl_queue;
+    const int N = ctx->N;
+    double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    // a zero RHS must not turn the relative error below into a division by
+    // zero (same guard as in the netlib reference implementation)
+    if (rhs_norm == 0.0)
+        rhs_norm = 1.0;
+
+    // upload the matrix and RHS
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]);
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, (size_t)N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    // (clBLAS has its own order/transpose enums, whose values differ from
+    // the CBLAS ones)
+    clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &ocl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            // p = res + beta * (p - omega * v)
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        // y = K p
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        // v = A y
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        alpha = rho / alpha;
+
+        // res -= alpha * v
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        // z = K res, t = A z
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &ocl_q, 1, &events[1], &events[0]);
+
+        // omega[0] = <t, res>, omega[1] = <t, t>
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        // x += alpha * y + omega * z
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        // res -= omega * t
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &ocl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        // relative residual, matching the convergence test of solve_sw()
+        err /= rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
+                        x, 0, NULL, NULL);
+    return i;
+}
+#endif
+
+/*
+ * Preconditioned BiCGStab on the host, using CBLAS.
+ *
+ * based on the wikipedia article
+ * and http://www.netlib.org/templates/matlab/bicgstab.m
+ *
+ * mat is the column-major NxN system matrix, rhs the right-hand side. The
+ * iteration starts from the estimate stored in ctx->x (set by
+ * md_bicgstab_init() and kept between calls) and uses ctx->k as the
+ * preconditioner. On convergence the solution is copied into x.
+ *
+ * Returns the zero-based index of the iteration at which the relative
+ * residual dropped below BICGSTAB_TOL (0 if the starting guess already
+ * satisfied it), or -1 if the iteration did not converge within
+ * BICGSTAB_MAXITER iterations; x is not written in that case.
+ */
+static int solve_sw(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    const int N = ctx->N;
+    double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega = 1.0;
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    double *k = ctx->k;
+    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+    double *res = ctx->res, *res0 = ctx->res0;
+
+    // as in the netlib reference: a zero RHS must not make the relative
+    // error a division by zero
+    if (rhs_norm == 0.0)
+        rhs_norm = 1.0;
+
+    // initialize the residual
+    memcpy(res, rhs, N * sizeof(*res));
+    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                mat, N, ctx->x, 1, 1.0, res, 1);
+
+    // the stored estimate may already solve the system; iterating on a
+    // vanishing residual would yield rho = 0 and alpha = 0/0, leaving NaNs
+    // in ctx->x for all later calls
+    err = cblas_dnrm2(N, res, 1) / rhs_norm;
+    if (err < BICGSTAB_TOL) {
+        memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+        return 0;
+    }
+
+    memcpy(res0, res, N * sizeof(*res0));
+    memcpy(p, res, N * sizeof(*p));
+
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        rho = cblas_ddot(N, res, 1, res0, 1);
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega);
+
+            // p = res + beta * (p - omega * v)
+            cblas_daxpy(N, -omega, v, 1, p, 1);
+            cblas_dscal(N, beta, p, 1);
+            cblas_daxpy(N, 1, res, 1, p, 1);
+        }
+
+        // y = K p
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, p, 1, 0.0, y, 1);
+
+        // v = A y
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, y, 1, 0.0, v, 1);
+
+        alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+        // res -= alpha * v
+        cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+        // z = K res, t = A z
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, res, 1, 0.0, z, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, z, 1, 0.0, t, 1);
+
+        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+        // x += alpha * y + omega * z
+        cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
+        cblas_daxpy(N, omega, z, 1, ctx->x, 1);
+
+        // res -= omega * t
+        cblas_daxpy(N, -omega, t, 1, res, 1);
+
+        err = cblas_dnrm2(N, res, 1) / rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+
+    return i;
+}
+
+/*
+ * Solve mat * x = rhs with the backend selected when the context was
+ * allocated (OpenCL if an OpenCL context was supplied, host CBLAS otherwise).
+ *
+ * Returns the non-negative value reported by the backend on success and a
+ * negative value on failure. When built with MD_VERIFY, the residual of the
+ * returned solution is recomputed on the host and the process is aborted if
+ * its largest component exceeds 1e-11.
+ */
+int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
+{
+    int ret;
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx)
+        ret = solve_cl(ctx, mat, rhs, x);
+    else
+#endif
+        ret = solve_sw(ctx, mat, rhs, x);
+    if (ret < 0)
+        return ret;
+
+#if MD_VERIFY
+    {
+        double max_res;
+        double *y;
+
+        y = malloc(sizeof(*y) * ctx->N);
+        if (!y)
+            return -ENOMEM;
+
+        // y = rhs - mat * x
+        memcpy(y, rhs, sizeof(*y) * ctx->N);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0,
+                    mat, ctx->N, x, 1, 1.0, y, 1);
+        max_res = fabs(y[cblas_idamax(ctx->N, y, 1)]);
+
+        // release the scratch vector before deciding, it used to be leaked
+        // on every call
+        free(y);
+
+        if (max_res > 1e-11)
+            abort();
+    }
+#endif
+
+    return ret;
+}
+
+/*
+ * Load the preconditioner matrix k (N*N doubles) and the starting guess x0
+ * (N doubles) into the solver, either by uploading them to the device
+ * buffers (OpenCL context) or by copying them into the host buffers.
+ *
+ * Always returns 0.
+ */
+int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
+{
+    /* do the size arithmetic in size_t, so that N * N cannot overflow int */
+    const size_t vec_size = (size_t)ctx->N * sizeof(double);
+    const size_t mat_size = (size_t)ctx->N * ctx->N * sizeof(double);
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx) {
+        cl_event events[2];
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, mat_size,
+                             k, 0, NULL, &events[0]);
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, vec_size,
+                             x0, 0, NULL, &events[1]);
+        /* the writes are non-blocking, k and x0 must stay valid until here */
+        clWaitForEvents(2, events);
+    } else
+#endif
+    {
+        memcpy(ctx->x, x0, vec_size);
+        memcpy(ctx->k, k, mat_size);
+    }
+
+    return 0;
+}
+
+/*
+ * Create a solver context for an NxN system and store it in *pctx.
+ *
+ * With a non-NULL ocl_ctx (OpenCL builds only) all buffers are created on
+ * the device; otherwise 32-byte aligned host buffers are allocated.
+ *
+ * Returns 0 on success. If any allocation fails, everything allocated so far
+ * is released, *pctx is left untouched and -ENOMEM is returned.
+ */
+int md_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
+                              cl_context ocl_ctx, cl_command_queue ocl_q)
+{
+    BiCGStabContext *ctx = calloc(1, sizeof(*ctx));
+    int ret = 0;
+
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->N = N;
+
+#if HAVE_OPENCL
+    if (ocl_ctx) {
+        ctx->ocl_ctx = ocl_ctx;
+        ctx->ocl_queue = ocl_q;
+
+#define ALLOC(dst, size) \
+do { \
+    ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \
+    if (ret != CL_SUCCESS) \
+        goto fail; \
+} while (0)
+
+        /* vectors */
+        ALLOC(cl_x, N * sizeof(double));
+        ALLOC(cl_p, N * sizeof(double));
+        ALLOC(cl_v, N * sizeof(double));
+        ALLOC(cl_y, N * sizeof(double));
+        ALLOC(cl_z, N * sizeof(double));
+        ALLOC(cl_t, N * sizeof(double));
+        ALLOC(cl_res, N * sizeof(double));
+        ALLOC(cl_res0, N * sizeof(double));
+        ALLOC(cl_tmp, N * sizeof(double));
+        ALLOC(cl_tmp1, N * 2 * sizeof(double));
+
+        /* matrices */
+        ALLOC(cl_k, N * N * sizeof(double));
+        ALLOC(cl_mat, N * N * sizeof(double));
+
+        /* scalars */
+        ALLOC(cl_rho, sizeof(double));
+        ALLOC(cl_alpha, sizeof(double));
+        ALLOC(cl_beta, sizeof(double));
+        ALLOC(cl_omega, 2 * sizeof(double));
+        ALLOC(cl_omega1, sizeof(double));
+    } else
+#endif
+    {
+        double **vectors[] = {
+            &ctx->x, &ctx->p, &ctx->v, &ctx->y,
+            &ctx->z, &ctx->t, &ctx->res, &ctx->res0,
+        };
+        const size_t vec_bytes = sizeof(double) * N;
+        size_t j;
+
+        /* every allocation is attempted; failures accumulate in ret */
+        for (j = 0; j < sizeof(vectors) / sizeof(*vectors); j++)
+            ret |= posix_memalign((void**)vectors[j], 32, vec_bytes);
+
+        ret |= posix_memalign((void**)&ctx->k, 32, vec_bytes * N);
+    }
+
+fail:
+    if (ret) {
+        md_bicgstab_context_free(&ctx);
+        return -ENOMEM;
+    }
+
+    *pctx = ctx;
+    return 0;
+}
+
+/*
+ * Release a solver context together with all host and (in OpenCL builds)
+ * device buffers it owns, then reset *pctx to NULL. Does nothing when *pctx
+ * is already NULL.
+ */
+void md_bicgstab_context_free(BiCGStabContext **pctx)
+{
+    BiCGStabContext *ctx = *pctx;
+
+    if (ctx) {
+        double *host_bufs[] = {
+            ctx->x, ctx->p, ctx->v, ctx->y, ctx->z, ctx->t,
+            ctx->res, ctx->res0, ctx->k,
+        };
+        size_t j;
+
+        for (j = 0; j < sizeof(host_bufs) / sizeof(*host_bufs); j++)
+            free(host_bufs[j]);
+
+#if HAVE_OPENCL
+        if (ctx->ocl_ctx) {
+            cl_mem dev_bufs[] = {
+                ctx->cl_x, ctx->cl_p, ctx->cl_v, ctx->cl_y, ctx->cl_z,
+                ctx->cl_t, ctx->cl_res, ctx->cl_res0, ctx->cl_tmp,
+                ctx->cl_tmp1,
+
+                ctx->cl_k, ctx->cl_mat,
+
+                ctx->cl_rho, ctx->cl_alpha, ctx->cl_beta, ctx->cl_omega,
+                ctx->cl_omega1,
+            };
+
+            for (j = 0; j < sizeof(dev_bufs) / sizeof(*dev_bufs); j++)
+                clReleaseMemObject(dev_bufs[j]);
+        }
+#endif
+
+        free(ctx);
+        *pctx = NULL;
+    }
+}
diff --git a/src/bicgstab.h b/src/bicgstab.h
new file mode 100644
index 0000000..70624f4
--- /dev/null
+++ b/src/bicgstab.h
@@ -0,0 +1,60 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_BICGSTAB_H
+#define MD_BICGSTAB_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+typedef struct BiCGStabContext BiCGStabContext;
+
+/**
+ * Allocate and initialize the solver for the NxN system.
+ *
+ * If the OpenCL context and command queue are provided (non-NULL), the solver
+ * will run using clBLAS.
+ */
+int md_bicgstab_context_alloc(BiCGStabContext **ctx, int N,
+ cl_context ocl_ctx, cl_command_queue ocl_q);
+
+/**
+ * Free the solver and all its internal state.
+ */
+void md_bicgstab_context_free(BiCGStabContext **ctx);
+
+/**
+ * Initialise the solver with the given preconditioner matrix k and the
+ * initial guess x0. This function may be called any number of times on a
+ * given solver context.
+ */
+int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0);
+
+/**
+ * Solve the linear system
+ * mat · x = rhs
+ * The result is written into x.
+ */
+int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x);
+
+#endif /* MD_BICGSTAB_H */
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..2b1ebf6
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,29 @@
+#ifndef MD_COMMON_H
+#define MD_COMMON_H
+
+#define HAVE_OPENCL 0
+#define MD_VERIFY 0
+#define MD_POLAR 0
+
+#define SQR(x) ((x) * (x)) /* square of x; x is evaluated twice */
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) /* 1.0 for x >= 0 (zero included), -1.0 otherwise */
+#define MAX(x, y) ((x) > (y) ? (x) : (y)) /* larger argument; one argument is evaluated twice */
+#define MIN(x, y) ((x) > (y) ? (y) : (x)) /* smaller argument; one argument is evaluated twice */
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) /* element count; arr must be a real array, not a pointer */
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+/* Current gettimeofday() time as a single count of microseconds. */
+static inline int64_t gettime(void)
+{
+    struct timeval now;
+    int64_t usec;
+
+    gettimeofday(&now, NULL);
+
+    usec  = (int64_t)now.tv_sec * 1000000;
+    usec += now.tv_usec;
+
+    return usec;
+}
+
+#endif /* MD_COMMON_H */
diff --git a/src/config.asm b/src/config.asm
new file mode 100644
index 0000000..0ee0ca2
--- /dev/null
+++ b/src/config.asm
@@ -0,0 +1,1325 @@
+%define ARCH_AARCH64 0
+%define ARCH_ALPHA 0
+%define ARCH_ARM 0
+%define ARCH_AVR32 0
+%define ARCH_AVR32_AP 0
+%define ARCH_AVR32_UC 0
+%define ARCH_BFIN 0
+%define ARCH_IA64 0
+%define ARCH_M68K 0
+%define ARCH_MIPS 0
+%define ARCH_MIPS64 0
+%define ARCH_PARISC 0
+%define ARCH_PPC 0
+%define ARCH_PPC64 0
+%define ARCH_S390 0
+%define ARCH_SH4 0
+%define ARCH_SPARC 0
+%define ARCH_SPARC64 0
+%define ARCH_TILEGX 0
+%define ARCH_TILEPRO 0
+%define ARCH_TOMI 0
+%define ARCH_X86 1
+%define ARCH_X86_32 0
+%define ARCH_X86_64 1
+%define HAVE_ARMV5TE 0
+%define HAVE_ARMV6 0
+%define HAVE_ARMV6T2 0
+%define HAVE_ARMV8 0
+%define HAVE_NEON 0
+%define HAVE_VFP 0
+%define HAVE_VFPV3 0
+%define HAVE_ALTIVEC 0
+%define HAVE_DCBZL 1
+%define HAVE_LDBRX 1
+%define HAVE_PPC4XX 0
+%define HAVE_AMD3DNOW 1
+%define HAVE_AMD3DNOWEXT 1
+%define HAVE_AVX 1
+%define HAVE_AVX2 1
+%define HAVE_FMA3 1
+%define HAVE_FMA4 1
+%define HAVE_MMX 1
+%define HAVE_MMXEXT 1
+%define HAVE_SSE 1
+%define HAVE_SSE2 1
+%define HAVE_SSE3 1
+%define HAVE_SSE4 1
+%define HAVE_SSE42 1
+%define HAVE_SSSE3 1
+%define HAVE_XOP 1
+%define HAVE_CPUNOP 1
+%define HAVE_I686 1
+%define HAVE_LOONGSON 1
+%define HAVE_VIS 1
+%define HAVE_ARMV5TE_EXTERNAL 0
+%define HAVE_ARMV6_EXTERNAL 0
+%define HAVE_ARMV6T2_EXTERNAL 0
+%define HAVE_ARMV8_EXTERNAL 0
+%define HAVE_NEON_EXTERNAL 0
+%define HAVE_VFP_EXTERNAL 0
+%define HAVE_VFPV3_EXTERNAL 0
+%define HAVE_ALTIVEC_EXTERNAL 0
+%define HAVE_DCBZL_EXTERNAL 0
+%define HAVE_LDBRX_EXTERNAL 0
+%define HAVE_PPC4XX_EXTERNAL 0
+%define HAVE_AMD3DNOW_EXTERNAL 1
+%define HAVE_AMD3DNOWEXT_EXTERNAL 1
+%define HAVE_AVX_EXTERNAL 1
+%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_FMA3_EXTERNAL 1
+%define HAVE_FMA4_EXTERNAL 1
+%define HAVE_MMX_EXTERNAL 1
+%define HAVE_MMXEXT_EXTERNAL 1
+%define HAVE_SSE_EXTERNAL 1
+%define HAVE_SSE2_EXTERNAL 1
+%define HAVE_SSE3_EXTERNAL 1
+%define HAVE_SSE4_EXTERNAL 1
+%define HAVE_SSE42_EXTERNAL 1
+%define HAVE_SSSE3_EXTERNAL 1
+%define HAVE_XOP_EXTERNAL 1
+%define HAVE_CPUNOP_EXTERNAL 0
+%define HAVE_I686_EXTERNAL 0
+%define HAVE_LOONGSON_EXTERNAL 0
+%define HAVE_VIS_EXTERNAL 0
+%define HAVE_ARMV5TE_INLINE 0
+%define HAVE_ARMV6_INLINE 0
+%define HAVE_ARMV6T2_INLINE 0
+%define HAVE_ARMV8_INLINE 0
+%define HAVE_NEON_INLINE 0
+%define HAVE_VFP_INLINE 0
+%define HAVE_VFPV3_INLINE 0
+%define HAVE_ALTIVEC_INLINE 0
+%define HAVE_DCBZL_INLINE 0
+%define HAVE_LDBRX_INLINE 0
+%define HAVE_PPC4XX_INLINE 0
+%define HAVE_AMD3DNOW_INLINE 1
+%define HAVE_AMD3DNOWEXT_INLINE 1
+%define HAVE_AVX_INLINE 1
+%define HAVE_AVX2_INLINE 1
+%define HAVE_FMA3_INLINE 1
+%define HAVE_FMA4_INLINE 1
+%define HAVE_MMX_INLINE 1
+%define HAVE_MMXEXT_INLINE 1
+%define HAVE_SSE_INLINE 1
+%define HAVE_SSE2_INLINE 1
+%define HAVE_SSE3_INLINE 1
+%define HAVE_SSE4_INLINE 1
+%define HAVE_SSE42_INLINE 1
+%define HAVE_SSSE3_INLINE 1
+%define HAVE_XOP_INLINE 1
+%define HAVE_CPUNOP_INLINE 0
+%define HAVE_I686_INLINE 0
+%define HAVE_LOONGSON_INLINE 0
+%define HAVE_VIS_INLINE 0
+%define HAVE_ALIGNED_STACK 1
+%define HAVE_FAST_64BIT 1
+%define HAVE_FAST_CLZ 1
+%define HAVE_FAST_CMOV 1
+%define HAVE_LOCAL_ALIGNED_8 1
+%define HAVE_LOCAL_ALIGNED_16 1
+%define HAVE_SIMD_ALIGN_16 1
+%define HAVE_ATOMICS_GCC 1
+%define HAVE_ATOMICS_SUNCC 0
+%define HAVE_ATOMICS_WIN32 0
+%define HAVE_ATOMIC_CAS_PTR 0
+%define HAVE_MACHINE_RW_BARRIER 0
+%define HAVE_MEMORYBARRIER 0
+%define HAVE_MM_EMPTY 1
+%define HAVE_RDTSC 0
+%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
+%define HAVE_INLINE_ASM 1
+%define HAVE_SYMVER 1
+%define HAVE_YASM 1
+%define HAVE_BIGENDIAN 0
+%define HAVE_FAST_UNALIGNED 1
+%define HAVE_ALSA_ASOUNDLIB_H 1
+%define HAVE_ALTIVEC_H 0
+%define HAVE_ARPA_INET_H 1
+%define HAVE_CDIO_PARANOIA_H 0
+%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
+%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
+%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
+%define HAVE_DEV_IC_BT8XX_H 0
+%define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0
+%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
+%define HAVE_DIRECT_H 0
+%define HAVE_DLFCN_H 1
+%define HAVE_DXVA_H 0
+%define HAVE_GSM_H 0
+%define HAVE_IO_H 0
+%define HAVE_MACH_MACH_TIME_H 0
+%define HAVE_MACHINE_IOCTL_BT848_H 0
+%define HAVE_MACHINE_IOCTL_METEOR_H 0
+%define HAVE_MALLOC_H 1
+%define HAVE_POLL_H 1
+%define HAVE_SNDIO_H 0
+%define HAVE_SOUNDCARD_H 0
+%define HAVE_SYS_MMAN_H 1
+%define HAVE_SYS_PARAM_H 1
+%define HAVE_SYS_RESOURCE_H 1
+%define HAVE_SYS_SELECT_H 1
+%define HAVE_SYS_SOUNDCARD_H 1
+%define HAVE_SYS_TIME_H 1
+%define HAVE_SYS_UN_H 1
+%define HAVE_SYS_VIDEOIO_H 0
+%define HAVE_UNISTD_H 1
+%define HAVE_WINDOWS_H 0
+%define HAVE_WINSOCK2_H 0
+%define HAVE_INTRINSICS_NEON 0
+%define HAVE_ATANF 1
+%define HAVE_ATAN2F 1
+%define HAVE_CBRTF 1
+%define HAVE_COSF 1
+%define HAVE_EXP2 1
+%define HAVE_EXP2F 1
+%define HAVE_EXPF 1
+%define HAVE_ISINF 1
+%define HAVE_ISNAN 1
+%define HAVE_LDEXPF 1
+%define HAVE_LLRINT 1
+%define HAVE_LLRINTF 1
+%define HAVE_LOG2 1
+%define HAVE_LOG2F 1
+%define HAVE_LOG10F 1
+%define HAVE_LRINT 1
+%define HAVE_LRINTF 1
+%define HAVE_POWF 1
+%define HAVE_RINT 1
+%define HAVE_ROUND 1
+%define HAVE_ROUNDF 1
+%define HAVE_SINF 1
+%define HAVE_TRUNC 1
+%define HAVE_TRUNCF 1
+%define HAVE_ALIGNED_MALLOC 0
+%define HAVE_CLOSESOCKET 0
+%define HAVE_COMMANDLINETOARGVW 0
+%define HAVE_COTASKMEMFREE 0
+%define HAVE_CRYPTGENRANDOM 0
+%define HAVE_DLOPEN 1
+%define HAVE_FCNTL 1
+%define HAVE_FLT_LIM 1
+%define HAVE_FORK 1
+%define HAVE_GETADDRINFO 1
+%define HAVE_GETHRTIME 0
+%define HAVE_GETOPT 1
+%define HAVE_GETPROCESSAFFINITYMASK 0
+%define HAVE_GETPROCESSMEMORYINFO 0
+%define HAVE_GETPROCESSTIMES 0
+%define HAVE_GETRUSAGE 1
+%define HAVE_GETSERVBYPORT 1
+%define HAVE_GETSYSTEMTIMEASFILETIME 0
+%define HAVE_GETTIMEOFDAY 1
+%define HAVE_INET_ATON 1
+%define HAVE_ISATTY 1
+%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
+%define HAVE_LOCALTIME_R 1
+%define HAVE_MACH_ABSOLUTE_TIME 0
+%define HAVE_MAPVIEWOFFILE 0
+%define HAVE_MEMALIGN 1
+%define HAVE_MKSTEMP 1
+%define HAVE_MMAP 1
+%define HAVE_MPROTECT 1
+%define HAVE_NANOSLEEP 1
+%define HAVE_POSIX_MEMALIGN 1
+%define HAVE_SCHED_GETAFFINITY 1
+%define HAVE_SETCONSOLETEXTATTRIBUTE 0
+%define HAVE_SETMODE 0
+%define HAVE_SETRLIMIT 1
+%define HAVE_SLEEP 0
+%define HAVE_STRERROR_R 1
+%define HAVE_STRPTIME 1
+%define HAVE_SYSCONF 1
+%define HAVE_SYSCTL 1
+%define HAVE_USLEEP 1
+%define HAVE_VIRTUALALLOC 0
+%define HAVE_PTHREADS 1
+%define HAVE_W32THREADS 0
+%define HAVE_AS_DN_DIRECTIVE 0
+%define HAVE_AS_FUNC 1
+%define HAVE_ASM_MOD_Q 0
+%define HAVE_ATTRIBUTE_MAY_ALIAS 1
+%define HAVE_ATTRIBUTE_PACKED 1
+%define HAVE_EBP_AVAILABLE 0
+%define HAVE_EBX_AVAILABLE 1
+%define HAVE_GNU_AS 1
+%define HAVE_IBM_ASM 0
+%define HAVE_INLINE_ASM_LABELS 1
+%define HAVE_PRAGMA_DEPRECATED 1
+%define HAVE_SYMVER_ASM_LABEL 0
+%define HAVE_SYMVER_GNU_ASM 1
+%define HAVE_VFP_ARGS 0
+%define HAVE_XFORM_ASM 0
+%define HAVE_XMM_CLOBBERS 1
+%define HAVE_SOCKLEN_T 1
+%define HAVE_STRUCT_ADDRINFO 1
+%define HAVE_STRUCT_GROUP_SOURCE_REQ 1
+%define HAVE_STRUCT_IP_MREQ_SOURCE 1
+%define HAVE_STRUCT_IPV6_MREQ 1
+%define HAVE_STRUCT_POLLFD 1
+%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1
+%define HAVE_STRUCT_SOCKADDR_IN6 1
+%define HAVE_STRUCT_SOCKADDR_SA_LEN 0
+%define HAVE_STRUCT_SOCKADDR_STORAGE 1
+%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1
+%define HAVE_ATOMICS_NATIVE 1
+%define HAVE_DOS_PATHS 0
+%define HAVE_DXVA2_LIB 0
+%define HAVE_LIBC_MSVCRT 0
+%define HAVE_LIBDC1394_1 0
+%define HAVE_LIBDC1394_2 0
+%define HAVE_SDL 0
+%define HAVE_THREADS 1
+%define HAVE_VDPAU_X11 0
+%define HAVE_XLIB 1
+%define CONFIG_BSFS 1
+%define CONFIG_DECODERS 1
+%define CONFIG_DEMUXERS 1
+%define CONFIG_ENCODERS 1
+%define CONFIG_FILTERS 1
+%define CONFIG_HWACCELS 0
+%define CONFIG_INDEVS 1
+%define CONFIG_MUXERS 1
+%define CONFIG_OUTDEVS 1
+%define CONFIG_PARSERS 1
+%define CONFIG_PROTOCOLS 1
+%define CONFIG_AVCODEC_EXAMPLE 1
+%define CONFIG_FILTER_AUDIO_EXAMPLE 1
+%define CONFIG_METADATA_EXAMPLE 1
+%define CONFIG_OUTPUT_EXAMPLE 1
+%define CONFIG_TRANSCODE_AAC_EXAMPLE 1
+%define CONFIG_AVISYNTH 0
+%define CONFIG_BZLIB 1
+%define CONFIG_FREI0R 0
+%define CONFIG_GNUTLS 0
+%define CONFIG_LIBBS2B 0
+%define CONFIG_LIBCDIO 0
+%define CONFIG_LIBDC1394 0
+%define CONFIG_LIBFAAC 0
+%define CONFIG_LIBFDK_AAC 0
+%define CONFIG_LIBFONTCONFIG 0
+%define CONFIG_LIBFREETYPE 0
+%define CONFIG_LIBGSM 0
+%define CONFIG_LIBILBC 0
+%define CONFIG_LIBMP3LAME 0
+%define CONFIG_LIBOPENCORE_AMRNB 0
+%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBOPENCV 0
+%define CONFIG_LIBOPENJPEG 0
+%define CONFIG_LIBOPUS 0
+%define CONFIG_LIBPULSE 0
+%define CONFIG_LIBRTMP 0
+%define CONFIG_LIBSCHROEDINGER 0
+%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBTHEORA 0
+%define CONFIG_LIBTWOLAME 0
+%define CONFIG_LIBVO_AACENC 0
+%define CONFIG_LIBVO_AMRWBENC 0
+%define CONFIG_LIBVORBIS 0
+%define CONFIG_LIBVPX 0
+%define CONFIG_LIBWAVPACK 0
+%define CONFIG_LIBWEBP 0
+%define CONFIG_LIBX264 0
+%define CONFIG_LIBX265 0
+%define CONFIG_LIBXAVS 0
+%define CONFIG_LIBXVID 0
+%define CONFIG_OPENSSL 0
+%define CONFIG_X11GRAB 0
+%define CONFIG_ZLIB 1
+%define CONFIG_GRAY 0
+%define CONFIG_HARDCODED_TABLES 0
+%define CONFIG_RUNTIME_CPUDETECT 0
+%define CONFIG_SAFE_BITSTREAM_READER 1
+%define CONFIG_SHARED 0
+%define CONFIG_SMALL 0
+%define CONFIG_SRAM 0
+%define CONFIG_STATIC 1
+%define CONFIG_SWSCALE_ALPHA 1
+%define CONFIG_DXVA2 0
+%define CONFIG_VAAPI 0
+%define CONFIG_VDA 0
+%define CONFIG_VDPAU 0
+%define CONFIG_GPL 0
+%define CONFIG_NONFREE 0
+%define CONFIG_VERSION3 0
+%define CONFIG_AVCODEC 1
+%define CONFIG_AVDEVICE 1
+%define CONFIG_AVFILTER 1
+%define CONFIG_AVFORMAT 1
+%define CONFIG_AVRESAMPLE 1
+%define CONFIG_AVUTIL 1
+%define CONFIG_SWSCALE 1
+%define CONFIG_AVCONV 1
+%define CONFIG_AVPLAY 0
+%define CONFIG_AVPROBE 1
+%define CONFIG_DCT 1
+%define CONFIG_DOC 1
+%define CONFIG_ERROR_RESILIENCE 1
+%define CONFIG_FFT 1
+%define CONFIG_LSP 1
+%define CONFIG_LZO 1
+%define CONFIG_MDCT 1
+%define CONFIG_NETWORK 1
+%define CONFIG_RDFT 1
+%define CONFIG_MEMALIGN_HACK 0
+%define CONFIG_NEON_CLOBBER_TEST 0
+%define CONFIG_PIC 0
+%define CONFIG_POD2MAN 1
+%define CONFIG_TEXI2HTML 0
+%define CONFIG_THUMB 0
+%define CONFIG_XMM_CLOBBER_TEST 0
+%define CONFIG_AANDCTTABLES 1
+%define CONFIG_AC3DSP 1
+%define CONFIG_AUDIO_FRAME_QUEUE 1
+%define CONFIG_AUDIODSP 1
+%define CONFIG_BLOCKDSP 1
+%define CONFIG_BSWAPDSP 1
+%define CONFIG_CABAC 1
+%define CONFIG_DVPROFILE 1
+%define CONFIG_FDCTDSP 1
+%define CONFIG_GCRYPT 0
+%define CONFIG_GOLOMB 1
+%define CONFIG_GPLV3 0
+%define CONFIG_H263DSP 1
+%define CONFIG_H264CHROMA 1
+%define CONFIG_H264DSP 1
+%define CONFIG_H264PRED 1
+%define CONFIG_H264QPEL 1
+%define CONFIG_HPELDSP 1
+%define CONFIG_HUFFMAN 1
+%define CONFIG_HUFFYUVDSP 1
+%define CONFIG_HUFFYUVENCDSP 1
+%define CONFIG_IDCTDSP 1
+%define CONFIG_IIRFILTER 1
+%define CONFIG_INTRAX8 1
+%define CONFIG_LGPLV3 0
+%define CONFIG_LPC 1
+%define CONFIG_ME_CMP 1
+%define CONFIG_MPEG_ER 1
+%define CONFIG_MPEGAUDIO 1
+%define CONFIG_MPEGAUDIODSP 1
+%define CONFIG_MPEGVIDEO 1
+%define CONFIG_MPEGVIDEOENC 1
+%define CONFIG_NETTLE 0
+%define CONFIG_PIXBLOCKDSP 1
+%define CONFIG_QPELDSP 1
+%define CONFIG_RANGECODER 1
+%define CONFIG_RIFFDEC 1
+%define CONFIG_RIFFENC 1
+%define CONFIG_RTPDEC 1
+%define CONFIG_RTPENC_CHAIN 1
+%define CONFIG_SINEWIN 1
+%define CONFIG_TPELDSP 1
+%define CONFIG_VIDEODSP 1
+%define CONFIG_VP3DSP 1
+%define CONFIG_AAC_ADTSTOASC_BSF 1
+%define CONFIG_CHOMP_BSF 1
+%define CONFIG_DUMP_EXTRADATA_BSF 1
+%define CONFIG_H264_MP4TOANNEXB_BSF 1
+%define CONFIG_IMX_DUMP_HEADER_BSF 1
+%define CONFIG_MJPEG2JPEG_BSF 1
+%define CONFIG_MJPEGA_DUMP_HEADER_BSF 1
+%define CONFIG_MOV2TEXTSUB_BSF 1
+%define CONFIG_NOISE_BSF 1
+%define CONFIG_REMOVE_EXTRADATA_BSF 1
+%define CONFIG_TEXT2MOVSUB_BSF 1
+%define CONFIG_AASC_DECODER 1
+%define CONFIG_AIC_DECODER 1
+%define CONFIG_ALIAS_PIX_DECODER 1
+%define CONFIG_AMV_DECODER 1
+%define CONFIG_ANM_DECODER 1
+%define CONFIG_ANSI_DECODER 1
+%define CONFIG_ASV1_DECODER 1
+%define CONFIG_ASV2_DECODER 1
+%define CONFIG_AURA_DECODER 1
+%define CONFIG_AURA2_DECODER 1
+%define CONFIG_AVS_DECODER 1
+%define CONFIG_BETHSOFTVID_DECODER 1
+%define CONFIG_BFI_DECODER 1
+%define CONFIG_BINK_DECODER 1
+%define CONFIG_BMP_DECODER 1
+%define CONFIG_BMV_VIDEO_DECODER 1
+%define CONFIG_BRENDER_PIX_DECODER 1
+%define CONFIG_C93_DECODER 1
+%define CONFIG_CAVS_DECODER 1
+%define CONFIG_CDGRAPHICS_DECODER 1
+%define CONFIG_CDXL_DECODER 1
+%define CONFIG_CINEPAK_DECODER 1
+%define CONFIG_CLJR_DECODER 1
+%define CONFIG_CLLC_DECODER 1
+%define CONFIG_COMFORTNOISE_DECODER 1
+%define CONFIG_CSCD_DECODER 1
+%define CONFIG_CYUV_DECODER 1
+%define CONFIG_DFA_DECODER 1
+%define CONFIG_DNXHD_DECODER 1
+%define CONFIG_DPX_DECODER 1
+%define CONFIG_DSICINVIDEO_DECODER 1
+%define CONFIG_DVVIDEO_DECODER 1
+%define CONFIG_DXA_DECODER 1
+%define CONFIG_DXTORY_DECODER 1
+%define CONFIG_EACMV_DECODER 1
+%define CONFIG_EAMAD_DECODER 1
+%define CONFIG_EATGQ_DECODER 1
+%define CONFIG_EATGV_DECODER 1
+%define CONFIG_EATQI_DECODER 1
+%define CONFIG_EIGHTBPS_DECODER 1
+%define CONFIG_EIGHTSVX_EXP_DECODER 1
+%define CONFIG_EIGHTSVX_FIB_DECODER 1
+%define CONFIG_ESCAPE124_DECODER 1
+%define CONFIG_ESCAPE130_DECODER 1
+%define CONFIG_EXR_DECODER 1
+%define CONFIG_FFV1_DECODER 1
+%define CONFIG_FFVHUFF_DECODER 1
+%define CONFIG_FIC_DECODER 1
+%define CONFIG_FLASHSV_DECODER 1
+%define CONFIG_FLASHSV2_DECODER 1
+%define CONFIG_FLIC_DECODER 1
+%define CONFIG_FLV_DECODER 1
+%define CONFIG_FOURXM_DECODER 1
+%define CONFIG_FRAPS_DECODER 1
+%define CONFIG_FRWU_DECODER 1
+%define CONFIG_G2M_DECODER 1
+%define CONFIG_GIF_DECODER 1
+%define CONFIG_H261_DECODER 1
+%define CONFIG_H263_DECODER 1
+%define CONFIG_H263I_DECODER 1
+%define CONFIG_H264_DECODER 1
+%define CONFIG_HEVC_DECODER 1
+%define CONFIG_HNM4_VIDEO_DECODER 1
+%define CONFIG_HUFFYUV_DECODER 1
+%define CONFIG_IDCIN_DECODER 1
+%define CONFIG_IFF_BYTERUN1_DECODER 1
+%define CONFIG_IFF_ILBM_DECODER 1
+%define CONFIG_INDEO2_DECODER 1
+%define CONFIG_INDEO3_DECODER 1
+%define CONFIG_INDEO4_DECODER 1
+%define CONFIG_INDEO5_DECODER 1
+%define CONFIG_INTERPLAY_VIDEO_DECODER 1
+%define CONFIG_JPEG2000_DECODER 1
+%define CONFIG_JPEGLS_DECODER 1
+%define CONFIG_JV_DECODER 1
+%define CONFIG_KGV1_DECODER 1
+%define CONFIG_KMVC_DECODER 1
+%define CONFIG_LAGARITH_DECODER 1
+%define CONFIG_LOCO_DECODER 1
+%define CONFIG_MDEC_DECODER 1
+%define CONFIG_MIMIC_DECODER 1
+%define CONFIG_MJPEG_DECODER 1
+%define CONFIG_MJPEGB_DECODER 1
+%define CONFIG_MMVIDEO_DECODER 1
+%define CONFIG_MOTIONPIXELS_DECODER 1
+%define CONFIG_MPEG_XVMC_DECODER 0
+%define CONFIG_MPEG1VIDEO_DECODER 1
+%define CONFIG_MPEG2VIDEO_DECODER 1
+%define CONFIG_MPEG4_DECODER 1
+%define CONFIG_MSA1_DECODER 1
+%define CONFIG_MSMPEG4V1_DECODER 1
+%define CONFIG_MSMPEG4V2_DECODER 1
+%define CONFIG_MSMPEG4V3_DECODER 1
+%define CONFIG_MSRLE_DECODER 1
+%define CONFIG_MSS1_DECODER 1
+%define CONFIG_MSS2_DECODER 1
+%define CONFIG_MSVIDEO1_DECODER 1
+%define CONFIG_MSZH_DECODER 1
+%define CONFIG_MTS2_DECODER 1
+%define CONFIG_MVC1_DECODER 1
+%define CONFIG_MVC2_DECODER 1
+%define CONFIG_MXPEG_DECODER 1
+%define CONFIG_NUV_DECODER 1
+%define CONFIG_PAF_VIDEO_DECODER 1
+%define CONFIG_PAM_DECODER 1
+%define CONFIG_PBM_DECODER 1
+%define CONFIG_PCX_DECODER 1
+%define CONFIG_PGM_DECODER 1
+%define CONFIG_PGMYUV_DECODER 1
+%define CONFIG_PICTOR_DECODER 1
+%define CONFIG_PNG_DECODER 1
+%define CONFIG_PPM_DECODER 1
+%define CONFIG_PRORES_DECODER 1
+%define CONFIG_PTX_DECODER 1
+%define CONFIG_QDRAW_DECODER 1
+%define CONFIG_QPEG_DECODER 1
+%define CONFIG_QTRLE_DECODER 1
+%define CONFIG_R10K_DECODER 1
+%define CONFIG_R210_DECODER 1
+%define CONFIG_RAWVIDEO_DECODER 1
+%define CONFIG_RL2_DECODER 1
+%define CONFIG_ROQ_DECODER 1
+%define CONFIG_RPZA_DECODER 1
+%define CONFIG_RV10_DECODER 1
+%define CONFIG_RV20_DECODER 1
+%define CONFIG_RV30_DECODER 1
+%define CONFIG_RV40_DECODER 1
+%define CONFIG_S302M_DECODER 1
+%define CONFIG_SANM_DECODER 1
+%define CONFIG_SGI_DECODER 1
+%define CONFIG_SGIRLE_DECODER 1
+%define CONFIG_SMACKER_DECODER 1
+%define CONFIG_SMC_DECODER 1
+%define CONFIG_SP5X_DECODER 1
+%define CONFIG_SUNRAST_DECODER 1
+%define CONFIG_SVQ1_DECODER 1
+%define CONFIG_SVQ3_DECODER 1
+%define CONFIG_TARGA_DECODER 1
+%define CONFIG_THEORA_DECODER 1
+%define CONFIG_THP_DECODER 1
+%define CONFIG_TIERTEXSEQVIDEO_DECODER 1
+%define CONFIG_TIFF_DECODER 1
+%define CONFIG_TMV_DECODER 1
+%define CONFIG_TRUEMOTION1_DECODER 1
+%define CONFIG_TRUEMOTION2_DECODER 1
+%define CONFIG_TSCC_DECODER 1
+%define CONFIG_TSCC2_DECODER 1
+%define CONFIG_TXD_DECODER 1
+%define CONFIG_ULTI_DECODER 1
+%define CONFIG_UTVIDEO_DECODER 1
+%define CONFIG_V210_DECODER 1
+%define CONFIG_V210X_DECODER 1
+%define CONFIG_V410_DECODER 1
+%define CONFIG_VB_DECODER 1
+%define CONFIG_VBLE_DECODER 1
+%define CONFIG_VC1_DECODER 1
+%define CONFIG_VC1IMAGE_DECODER 1
+%define CONFIG_VCR1_DECODER 1
+%define CONFIG_VMDVIDEO_DECODER 1
+%define CONFIG_VMNC_DECODER 1
+%define CONFIG_VP3_DECODER 1
+%define CONFIG_VP5_DECODER 1
+%define CONFIG_VP6_DECODER 1
+%define CONFIG_VP6A_DECODER 1
+%define CONFIG_VP6F_DECODER 1
+%define CONFIG_VP7_DECODER 1
+%define CONFIG_VP8_DECODER 1
+%define CONFIG_VP9_DECODER 1
+%define CONFIG_VQA_DECODER 1
+%define CONFIG_WEBP_DECODER 1
+%define CONFIG_WMV1_DECODER 1
+%define CONFIG_WMV2_DECODER 1
+%define CONFIG_WMV3_DECODER 1
+%define CONFIG_WMV3IMAGE_DECODER 1
+%define CONFIG_WNV1_DECODER 1
+%define CONFIG_XAN_WC3_DECODER 1
+%define CONFIG_XAN_WC4_DECODER 1
+%define CONFIG_XBM_DECODER 1
+%define CONFIG_XL_DECODER 1
+%define CONFIG_XWD_DECODER 1
+%define CONFIG_YOP_DECODER 1
+%define CONFIG_ZEROCODEC_DECODER 1
+%define CONFIG_ZLIB_DECODER 1
+%define CONFIG_ZMBV_DECODER 1
+%define CONFIG_AAC_DECODER 1
+%define CONFIG_AAC_LATM_DECODER 1
+%define CONFIG_AC3_DECODER 1
+%define CONFIG_ALAC_DECODER 1
+%define CONFIG_ALS_DECODER 1
+%define CONFIG_AMRNB_DECODER 1
+%define CONFIG_AMRWB_DECODER 1
+%define CONFIG_APE_DECODER 1
+%define CONFIG_ATRAC1_DECODER 1
+%define CONFIG_ATRAC3_DECODER 1
+%define CONFIG_ATRAC3P_DECODER 1
+%define CONFIG_BINKAUDIO_DCT_DECODER 1
+%define CONFIG_BINKAUDIO_RDFT_DECODER 1
+%define CONFIG_BMV_AUDIO_DECODER 1
+%define CONFIG_COOK_DECODER 1
+%define CONFIG_DCA_DECODER 1
+%define CONFIG_DSICINAUDIO_DECODER 1
+%define CONFIG_EAC3_DECODER 1
+%define CONFIG_FLAC_DECODER 1
+%define CONFIG_G723_1_DECODER 1
+%define CONFIG_GSM_DECODER 1
+%define CONFIG_GSM_MS_DECODER 1
+%define CONFIG_IAC_DECODER 1
+%define CONFIG_IMC_DECODER 1
+%define CONFIG_MACE3_DECODER 1
+%define CONFIG_MACE6_DECODER 1
+%define CONFIG_METASOUND_DECODER 1
+%define CONFIG_MLP_DECODER 1
+%define CONFIG_MP1_DECODER 1
+%define CONFIG_MP1FLOAT_DECODER 1
+%define CONFIG_MP2_DECODER 1
+%define CONFIG_MP2FLOAT_DECODER 1
+%define CONFIG_MP3_DECODER 1
+%define CONFIG_MP3FLOAT_DECODER 1
+%define CONFIG_MP3ADU_DECODER 1
+%define CONFIG_MP3ADUFLOAT_DECODER 1
+%define CONFIG_MP3ON4_DECODER 1
+%define CONFIG_MP3ON4FLOAT_DECODER 1
+%define CONFIG_MPC7_DECODER 1
+%define CONFIG_MPC8_DECODER 1
+%define CONFIG_NELLYMOSER_DECODER 1
+%define CONFIG_ON2AVC_DECODER 1
+%define CONFIG_OPUS_DECODER 1
+%define CONFIG_PAF_AUDIO_DECODER 1
+%define CONFIG_QCELP_DECODER 1
+%define CONFIG_QDM2_DECODER 1
+%define CONFIG_RA_144_DECODER 1
+%define CONFIG_RA_288_DECODER 1
+%define CONFIG_RALF_DECODER 1
+%define CONFIG_SHORTEN_DECODER 1
+%define CONFIG_SIPR_DECODER 1
+%define CONFIG_SMACKAUD_DECODER 1
+%define CONFIG_TAK_DECODER 1
+%define CONFIG_TRUEHD_DECODER 1
+%define CONFIG_TRUESPEECH_DECODER 1
+%define CONFIG_TTA_DECODER 1
+%define CONFIG_TWINVQ_DECODER 1
+%define CONFIG_VMDAUDIO_DECODER 1
+%define CONFIG_VORBIS_DECODER 1
+%define CONFIG_WAVPACK_DECODER 1
+%define CONFIG_WMALOSSLESS_DECODER 1
+%define CONFIG_WMAPRO_DECODER 1
+%define CONFIG_WMAV1_DECODER 1
+%define CONFIG_WMAV2_DECODER 1
+%define CONFIG_WMAVOICE_DECODER 1
+%define CONFIG_WS_SND1_DECODER 1
+%define CONFIG_PCM_ALAW_DECODER 1
+%define CONFIG_PCM_BLURAY_DECODER 1
+%define CONFIG_PCM_DVD_DECODER 1
+%define CONFIG_PCM_F32BE_DECODER 1
+%define CONFIG_PCM_F32LE_DECODER 1
+%define CONFIG_PCM_F64BE_DECODER 1
+%define CONFIG_PCM_F64LE_DECODER 1
+%define CONFIG_PCM_LXF_DECODER 1
+%define CONFIG_PCM_MULAW_DECODER 1
+%define CONFIG_PCM_S8_DECODER 1
+%define CONFIG_PCM_S8_PLANAR_DECODER 1
+%define CONFIG_PCM_S16BE_DECODER 1
+%define CONFIG_PCM_S16LE_DECODER 1
+%define CONFIG_PCM_S16LE_PLANAR_DECODER 1
+%define CONFIG_PCM_S24BE_DECODER 1
+%define CONFIG_PCM_S24DAUD_DECODER 1
+%define CONFIG_PCM_S24LE_DECODER 1
+%define CONFIG_PCM_S24LE_PLANAR_DECODER 1
+%define CONFIG_PCM_S32BE_DECODER 1
+%define CONFIG_PCM_S32LE_DECODER 1
+%define CONFIG_PCM_S32LE_PLANAR_DECODER 1
+%define CONFIG_PCM_U8_DECODER 1
+%define CONFIG_PCM_U16BE_DECODER 1
+%define CONFIG_PCM_U16LE_DECODER 1
+%define CONFIG_PCM_U24BE_DECODER 1
+%define CONFIG_PCM_U24LE_DECODER 1
+%define CONFIG_PCM_U32BE_DECODER 1
+%define CONFIG_PCM_U32LE_DECODER 1
+%define CONFIG_PCM_ZORK_DECODER 1
+%define CONFIG_INTERPLAY_DPCM_DECODER 1
+%define CONFIG_ROQ_DPCM_DECODER 1
+%define CONFIG_SOL_DPCM_DECODER 1
+%define CONFIG_XAN_DPCM_DECODER 1
+%define CONFIG_ADPCM_4XM_DECODER 1
+%define CONFIG_ADPCM_ADX_DECODER 1
+%define CONFIG_ADPCM_CT_DECODER 1
+%define CONFIG_ADPCM_EA_DECODER 1
+%define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1
+%define CONFIG_ADPCM_EA_R1_DECODER 1
+%define CONFIG_ADPCM_EA_R2_DECODER 1
+%define CONFIG_ADPCM_EA_R3_DECODER 1
+%define CONFIG_ADPCM_EA_XAS_DECODER 1
+%define CONFIG_ADPCM_G722_DECODER 1
+%define CONFIG_ADPCM_G726_DECODER 1
+%define CONFIG_ADPCM_IMA_AMV_DECODER 1
+%define CONFIG_ADPCM_IMA_APC_DECODER 1
+%define CONFIG_ADPCM_IMA_DK3_DECODER 1
+%define CONFIG_ADPCM_IMA_DK4_DECODER 1
+%define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1
+%define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1
+%define CONFIG_ADPCM_IMA_ISS_DECODER 1
+%define CONFIG_ADPCM_IMA_QT_DECODER 1
+%define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1
+%define CONFIG_ADPCM_IMA_WAV_DECODER 1
+%define CONFIG_ADPCM_IMA_WS_DECODER 1
+%define CONFIG_ADPCM_MS_DECODER 1
+%define CONFIG_ADPCM_SBPRO_2_DECODER 1
+%define CONFIG_ADPCM_SBPRO_3_DECODER 1
+%define CONFIG_ADPCM_SBPRO_4_DECODER 1
+%define CONFIG_ADPCM_SWF_DECODER 1
+%define CONFIG_ADPCM_THP_DECODER 1
+%define CONFIG_ADPCM_VIMA_DECODER 1
+%define CONFIG_ADPCM_XA_DECODER 1
+%define CONFIG_ADPCM_YAMAHA_DECODER 1
+%define CONFIG_ASS_DECODER 1
+%define CONFIG_DVBSUB_DECODER 1
+%define CONFIG_DVDSUB_DECODER 1
+%define CONFIG_PGSSUB_DECODER 1
+%define CONFIG_SRT_DECODER 1
+%define CONFIG_XSUB_DECODER 1
+%define CONFIG_LIBFDK_AAC_DECODER 0
+%define CONFIG_LIBGSM_DECODER 0
+%define CONFIG_LIBGSM_MS_DECODER 0
+%define CONFIG_LIBILBC_DECODER 0
+%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
+%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
+%define CONFIG_LIBOPENJPEG_DECODER 0
+%define CONFIG_LIBOPUS_DECODER 0
+%define CONFIG_LIBSCHROEDINGER_DECODER 0
+%define CONFIG_LIBSPEEX_DECODER 0
+%define CONFIG_LIBVPX_VP8_DECODER 0
+%define CONFIG_LIBVPX_VP9_DECODER 0
+%define CONFIG_AAC_DEMUXER 1
+%define CONFIG_AC3_DEMUXER 1
+%define CONFIG_ADX_DEMUXER 1
+%define CONFIG_AEA_DEMUXER 1
+%define CONFIG_AIFF_DEMUXER 1
+%define CONFIG_AMR_DEMUXER 1
+%define CONFIG_ANM_DEMUXER 1
+%define CONFIG_APC_DEMUXER 1
+%define CONFIG_APE_DEMUXER 1
+%define CONFIG_ASF_DEMUXER 1
+%define CONFIG_ASS_DEMUXER 1
+%define CONFIG_AU_DEMUXER 1
+%define CONFIG_AVI_DEMUXER 1
+%define CONFIG_AVISYNTH_DEMUXER 0
+%define CONFIG_AVS_DEMUXER 1
+%define CONFIG_BETHSOFTVID_DEMUXER 1
+%define CONFIG_BFI_DEMUXER 1
+%define CONFIG_BINK_DEMUXER 1
+%define CONFIG_BMV_DEMUXER 1
+%define CONFIG_C93_DEMUXER 1
+%define CONFIG_CAF_DEMUXER 1
+%define CONFIG_CAVSVIDEO_DEMUXER 1
+%define CONFIG_CDG_DEMUXER 1
+%define CONFIG_CDXL_DEMUXER 1
+%define CONFIG_DAUD_DEMUXER 1
+%define CONFIG_DFA_DEMUXER 1
+%define CONFIG_DIRAC_DEMUXER 1
+%define CONFIG_DNXHD_DEMUXER 1
+%define CONFIG_DSICIN_DEMUXER 1
+%define CONFIG_DTS_DEMUXER 1
+%define CONFIG_DV_DEMUXER 1
+%define CONFIG_DXA_DEMUXER 1
+%define CONFIG_EA_DEMUXER 1
+%define CONFIG_EA_CDATA_DEMUXER 1
+%define CONFIG_EAC3_DEMUXER 1
+%define CONFIG_FFMETADATA_DEMUXER 1
+%define CONFIG_FILMSTRIP_DEMUXER 1
+%define CONFIG_FLAC_DEMUXER 1
+%define CONFIG_FLIC_DEMUXER 1
+%define CONFIG_FLV_DEMUXER 1
+%define CONFIG_FOURXM_DEMUXER 1
+%define CONFIG_G722_DEMUXER 1
+%define CONFIG_G723_1_DEMUXER 1
+%define CONFIG_GSM_DEMUXER 1
+%define CONFIG_GXF_DEMUXER 1
+%define CONFIG_H261_DEMUXER 1
+%define CONFIG_H263_DEMUXER 1
+%define CONFIG_H264_DEMUXER 1
+%define CONFIG_HEVC_DEMUXER 1
+%define CONFIG_HLS_DEMUXER 1
+%define CONFIG_HNM_DEMUXER 1
+%define CONFIG_IDCIN_DEMUXER 1
+%define CONFIG_IFF_DEMUXER 1
+%define CONFIG_ILBC_DEMUXER 1
+%define CONFIG_IMAGE2_DEMUXER 1
+%define CONFIG_IMAGE2PIPE_DEMUXER 1
+%define CONFIG_INGENIENT_DEMUXER 1
+%define CONFIG_IPMOVIE_DEMUXER 1
+%define CONFIG_ISS_DEMUXER 1
+%define CONFIG_IV8_DEMUXER 1
+%define CONFIG_IVF_DEMUXER 1
+%define CONFIG_JV_DEMUXER 1
+%define CONFIG_LATM_DEMUXER 1
+%define CONFIG_LMLM4_DEMUXER 1
+%define CONFIG_LXF_DEMUXER 1
+%define CONFIG_M4V_DEMUXER 1
+%define CONFIG_MATROSKA_DEMUXER 1
+%define CONFIG_MJPEG_DEMUXER 1
+%define CONFIG_MLP_DEMUXER 1
+%define CONFIG_MM_DEMUXER 1
+%define CONFIG_MMF_DEMUXER 1
+%define CONFIG_MOV_DEMUXER 1
+%define CONFIG_MP3_DEMUXER 1
+%define CONFIG_MPC_DEMUXER 1
+%define CONFIG_MPC8_DEMUXER 1
+%define CONFIG_MPEGPS_DEMUXER 1
+%define CONFIG_MPEGTS_DEMUXER 1
+%define CONFIG_MPEGTSRAW_DEMUXER 1
+%define CONFIG_MPEGVIDEO_DEMUXER 1
+%define CONFIG_MSNWC_TCP_DEMUXER 1
+%define CONFIG_MTV_DEMUXER 1
+%define CONFIG_MV_DEMUXER 1
+%define CONFIG_MVI_DEMUXER 1
+%define CONFIG_MXF_DEMUXER 1
+%define CONFIG_MXG_DEMUXER 1
+%define CONFIG_NC_DEMUXER 1
+%define CONFIG_NSV_DEMUXER 1
+%define CONFIG_NUT_DEMUXER 1
+%define CONFIG_NUV_DEMUXER 1
+%define CONFIG_OGG_DEMUXER 1
+%define CONFIG_OMA_DEMUXER 1
+%define CONFIG_PAF_DEMUXER 1
+%define CONFIG_PCM_ALAW_DEMUXER 1
+%define CONFIG_PCM_MULAW_DEMUXER 1
+%define CONFIG_PCM_F64BE_DEMUXER 1
+%define CONFIG_PCM_F64LE_DEMUXER 1
+%define CONFIG_PCM_F32BE_DEMUXER 1
+%define CONFIG_PCM_F32LE_DEMUXER 1
+%define CONFIG_PCM_S32BE_DEMUXER 1
+%define CONFIG_PCM_S32LE_DEMUXER 1
+%define CONFIG_PCM_S24BE_DEMUXER 1
+%define CONFIG_PCM_S24LE_DEMUXER 1
+%define CONFIG_PCM_S16BE_DEMUXER 1
+%define CONFIG_PCM_S16LE_DEMUXER 1
+%define CONFIG_PCM_S8_DEMUXER 1
+%define CONFIG_PCM_U32BE_DEMUXER 1
+%define CONFIG_PCM_U32LE_DEMUXER 1
+%define CONFIG_PCM_U24BE_DEMUXER 1
+%define CONFIG_PCM_U24LE_DEMUXER 1
+%define CONFIG_PCM_U16BE_DEMUXER 1
+%define CONFIG_PCM_U16LE_DEMUXER 1
+%define CONFIG_PCM_U8_DEMUXER 1
+%define CONFIG_PMP_DEMUXER 1
+%define CONFIG_PVA_DEMUXER 1
+%define CONFIG_QCP_DEMUXER 1
+%define CONFIG_R3D_DEMUXER 1
+%define CONFIG_RAWVIDEO_DEMUXER 1
+%define CONFIG_RL2_DEMUXER 1
+%define CONFIG_RM_DEMUXER 1
+%define CONFIG_ROQ_DEMUXER 1
+%define CONFIG_RPL_DEMUXER 1
+%define CONFIG_RSO_DEMUXER 1
+%define CONFIG_RTP_DEMUXER 1
+%define CONFIG_RTSP_DEMUXER 1
+%define CONFIG_SAP_DEMUXER 1
+%define CONFIG_SDP_DEMUXER 1
+%define CONFIG_SEGAFILM_DEMUXER 1
+%define CONFIG_SHORTEN_DEMUXER 1
+%define CONFIG_SIFF_DEMUXER 1
+%define CONFIG_SMACKER_DEMUXER 1
+%define CONFIG_SMJPEG_DEMUXER 1
+%define CONFIG_SMUSH_DEMUXER 1
+%define CONFIG_SOL_DEMUXER 1
+%define CONFIG_SOX_DEMUXER 1
+%define CONFIG_SPDIF_DEMUXER 1
+%define CONFIG_SRT_DEMUXER 1
+%define CONFIG_STR_DEMUXER 1
+%define CONFIG_SWF_DEMUXER 1
+%define CONFIG_TAK_DEMUXER 1
+%define CONFIG_THP_DEMUXER 1
+%define CONFIG_TIERTEXSEQ_DEMUXER 1
+%define CONFIG_TMV_DEMUXER 1
+%define CONFIG_TRUEHD_DEMUXER 1
+%define CONFIG_TTA_DEMUXER 1
+%define CONFIG_TXD_DEMUXER 1
+%define CONFIG_TTY_DEMUXER 1
+%define CONFIG_VC1_DEMUXER 1
+%define CONFIG_VC1T_DEMUXER 1
+%define CONFIG_VMD_DEMUXER 1
+%define CONFIG_VOC_DEMUXER 1
+%define CONFIG_VQF_DEMUXER 1
+%define CONFIG_W64_DEMUXER 1
+%define CONFIG_WAV_DEMUXER 1
+%define CONFIG_WC3_DEMUXER 1
+%define CONFIG_WSAUD_DEMUXER 1
+%define CONFIG_WSVQA_DEMUXER 1
+%define CONFIG_WTV_DEMUXER 1
+%define CONFIG_WV_DEMUXER 1
+%define CONFIG_XA_DEMUXER 1
+%define CONFIG_XMV_DEMUXER 1
+%define CONFIG_XWMA_DEMUXER 1
+%define CONFIG_YOP_DEMUXER 1
+%define CONFIG_YUV4MPEGPIPE_DEMUXER 1
+%define CONFIG_A64MULTI_ENCODER 1
+%define CONFIG_A64MULTI5_ENCODER 1
+%define CONFIG_ALIAS_PIX_ENCODER 1
+%define CONFIG_ASV1_ENCODER 1
+%define CONFIG_ASV2_ENCODER 1
+%define CONFIG_BMP_ENCODER 1
+%define CONFIG_CLJR_ENCODER 1
+%define CONFIG_COMFORTNOISE_ENCODER 1
+%define CONFIG_DNXHD_ENCODER 1
+%define CONFIG_DPX_ENCODER 1
+%define CONFIG_DVVIDEO_ENCODER 1
+%define CONFIG_FFV1_ENCODER 1
+%define CONFIG_FFVHUFF_ENCODER 1
+%define CONFIG_FLASHSV_ENCODER 1
+%define CONFIG_FLV_ENCODER 1
+%define CONFIG_GIF_ENCODER 1
+%define CONFIG_H261_ENCODER 1
+%define CONFIG_H263_ENCODER 1
+%define CONFIG_H263P_ENCODER 1
+%define CONFIG_HUFFYUV_ENCODER 1
+%define CONFIG_JPEGLS_ENCODER 1
+%define CONFIG_LJPEG_ENCODER 1
+%define CONFIG_MJPEG_ENCODER 1
+%define CONFIG_MPEG1VIDEO_ENCODER 1
+%define CONFIG_MPEG2VIDEO_ENCODER 1
+%define CONFIG_MPEG4_ENCODER 1
+%define CONFIG_MSMPEG4V2_ENCODER 1
+%define CONFIG_MSMPEG4V3_ENCODER 1
+%define CONFIG_PAM_ENCODER 1
+%define CONFIG_PBM_ENCODER 1
+%define CONFIG_PCX_ENCODER 1
+%define CONFIG_PGM_ENCODER 1
+%define CONFIG_PGMYUV_ENCODER 1
+%define CONFIG_PNG_ENCODER 1
+%define CONFIG_PPM_ENCODER 1
+%define CONFIG_PRORES_ENCODER 1
+%define CONFIG_QTRLE_ENCODER 1
+%define CONFIG_RAWVIDEO_ENCODER 1
+%define CONFIG_ROQ_ENCODER 1
+%define CONFIG_RV10_ENCODER 1
+%define CONFIG_RV20_ENCODER 1
+%define CONFIG_SGI_ENCODER 1
+%define CONFIG_SUNRAST_ENCODER 1
+%define CONFIG_SVQ1_ENCODER 1
+%define CONFIG_TARGA_ENCODER 1
+%define CONFIG_LIBTWOLAME_ENCODER 0
+%define CONFIG_TIFF_ENCODER 1
+%define CONFIG_UTVIDEO_ENCODER 1
+%define CONFIG_V210_ENCODER 1
+%define CONFIG_V410_ENCODER 1
+%define CONFIG_WMV1_ENCODER 1
+%define CONFIG_WMV2_ENCODER 1
+%define CONFIG_XBM_ENCODER 1
+%define CONFIG_XWD_ENCODER 1
+%define CONFIG_ZLIB_ENCODER 1
+%define CONFIG_ZMBV_ENCODER 1
+%define CONFIG_AAC_ENCODER 1
+%define CONFIG_AC3_ENCODER 1
+%define CONFIG_AC3_FIXED_ENCODER 1
+%define CONFIG_ALAC_ENCODER 1
+%define CONFIG_EAC3_ENCODER 1
+%define CONFIG_FLAC_ENCODER 1
+%define CONFIG_MP2_ENCODER 1
+%define CONFIG_NELLYMOSER_ENCODER 1
+%define CONFIG_RA_144_ENCODER 1
+%define CONFIG_VORBIS_ENCODER 1
+%define CONFIG_WMAV1_ENCODER 1
+%define CONFIG_WMAV2_ENCODER 1
+%define CONFIG_PCM_ALAW_ENCODER 1
+%define CONFIG_PCM_F32BE_ENCODER 1
+%define CONFIG_PCM_F32LE_ENCODER 1
+%define CONFIG_PCM_F64BE_ENCODER 1
+%define CONFIG_PCM_F64LE_ENCODER 1
+%define CONFIG_PCM_MULAW_ENCODER 1
+%define CONFIG_PCM_S8_ENCODER 1
+%define CONFIG_PCM_S16BE_ENCODER 1
+%define CONFIG_PCM_S16LE_ENCODER 1
+%define CONFIG_PCM_S24BE_ENCODER 1
+%define CONFIG_PCM_S24DAUD_ENCODER 1
+%define CONFIG_PCM_S24LE_ENCODER 1
+%define CONFIG_PCM_S32BE_ENCODER 1
+%define CONFIG_PCM_S32LE_ENCODER 1
+%define CONFIG_PCM_U8_ENCODER 1
+%define CONFIG_PCM_U16BE_ENCODER 1
+%define CONFIG_PCM_U16LE_ENCODER 1
+%define CONFIG_PCM_U24BE_ENCODER 1
+%define CONFIG_PCM_U24LE_ENCODER 1
+%define CONFIG_PCM_U32BE_ENCODER 1
+%define CONFIG_PCM_U32LE_ENCODER 1
+%define CONFIG_ROQ_DPCM_ENCODER 1
+%define CONFIG_ADPCM_ADX_ENCODER 1
+%define CONFIG_ADPCM_G722_ENCODER 1
+%define CONFIG_ADPCM_G726_ENCODER 1
+%define CONFIG_ADPCM_IMA_QT_ENCODER 1
+%define CONFIG_ADPCM_IMA_WAV_ENCODER 1
+%define CONFIG_ADPCM_MS_ENCODER 1
+%define CONFIG_ADPCM_SWF_ENCODER 1
+%define CONFIG_ADPCM_YAMAHA_ENCODER 1
+%define CONFIG_ASS_ENCODER 1
+%define CONFIG_DVBSUB_ENCODER 1
+%define CONFIG_DVDSUB_ENCODER 1
+%define CONFIG_XSUB_ENCODER 1
+%define CONFIG_LIBFAAC_ENCODER 0
+%define CONFIG_LIBFDK_AAC_ENCODER 0
+%define CONFIG_LIBGSM_ENCODER 0
+%define CONFIG_LIBGSM_MS_ENCODER 0
+%define CONFIG_LIBILBC_ENCODER 0
+%define CONFIG_LIBMP3LAME_ENCODER 0
+%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
+%define CONFIG_LIBOPENJPEG_ENCODER 0
+%define CONFIG_LIBOPUS_ENCODER 0
+%define CONFIG_LIBSCHROEDINGER_ENCODER 0
+%define CONFIG_LIBSPEEX_ENCODER 0
+%define CONFIG_LIBTHEORA_ENCODER 0
+%define CONFIG_LIBVO_AACENC_ENCODER 0
+%define CONFIG_LIBVO_AMRWBENC_ENCODER 0
+%define CONFIG_LIBVORBIS_ENCODER 0
+%define CONFIG_LIBVPX_VP8_ENCODER 0
+%define CONFIG_LIBVPX_VP9_ENCODER 0
+%define CONFIG_LIBWAVPACK_ENCODER 0
+%define CONFIG_LIBWEBP_ENCODER 0
+%define CONFIG_LIBX264_ENCODER 0
+%define CONFIG_LIBX265_ENCODER 0
+%define CONFIG_LIBXAVS_ENCODER 0
+%define CONFIG_LIBXVID_ENCODER 0
+%define CONFIG_AFORMAT_FILTER 1
+%define CONFIG_AMIX_FILTER 1
+%define CONFIG_ANULL_FILTER 1
+%define CONFIG_ASETPTS_FILTER 1
+%define CONFIG_ASETTB_FILTER 1
+%define CONFIG_ASHOWINFO_FILTER 1
+%define CONFIG_ASPLIT_FILTER 1
+%define CONFIG_ASYNCTS_FILTER 1
+%define CONFIG_ATRIM_FILTER 1
+%define CONFIG_BS2B_FILTER 0
+%define CONFIG_CHANNELMAP_FILTER 1
+%define CONFIG_CHANNELSPLIT_FILTER 1
+%define CONFIG_COMPAND_FILTER 1
+%define CONFIG_JOIN_FILTER 1
+%define CONFIG_RESAMPLE_FILTER 1
+%define CONFIG_VOLUME_FILTER 1
+%define CONFIG_ANULLSRC_FILTER 1
+%define CONFIG_ANULLSINK_FILTER 1
+%define CONFIG_BLACKFRAME_FILTER 0
+%define CONFIG_BOXBLUR_FILTER 0
+%define CONFIG_COPY_FILTER 1
+%define CONFIG_CROP_FILTER 1
+%define CONFIG_CROPDETECT_FILTER 0
+%define CONFIG_DELOGO_FILTER 0
+%define CONFIG_DRAWBOX_FILTER 1
+%define CONFIG_DRAWTEXT_FILTER 0
+%define CONFIG_FADE_FILTER 1
+%define CONFIG_FIELDORDER_FILTER 1
+%define CONFIG_FORMAT_FILTER 1
+%define CONFIG_FPS_FILTER 1
+%define CONFIG_FRAMEPACK_FILTER 1
+%define CONFIG_FREI0R_FILTER 0
+%define CONFIG_GRADFUN_FILTER 1
+%define CONFIG_HFLIP_FILTER 1
+%define CONFIG_HQDN3D_FILTER 0
+%define CONFIG_INTERLACE_FILTER 0
+%define CONFIG_LUT_FILTER 1
+%define CONFIG_LUTRGB_FILTER 1
+%define CONFIG_LUTYUV_FILTER 1
+%define CONFIG_NEGATE_FILTER 1
+%define CONFIG_NOFORMAT_FILTER 1
+%define CONFIG_NULL_FILTER 1
+%define CONFIG_OCV_FILTER 0
+%define CONFIG_OVERLAY_FILTER 1
+%define CONFIG_PAD_FILTER 1
+%define CONFIG_PIXDESCTEST_FILTER 1
+%define CONFIG_SCALE_FILTER 1
+%define CONFIG_SELECT_FILTER 1
+%define CONFIG_SETDAR_FILTER 1
+%define CONFIG_SETPTS_FILTER 1
+%define CONFIG_SETSAR_FILTER 1
+%define CONFIG_SETTB_FILTER 1
+%define CONFIG_SHOWINFO_FILTER 1
+%define CONFIG_SHUFFLEPLANES_FILTER 1
+%define CONFIG_SPLIT_FILTER 1
+%define CONFIG_TRANSPOSE_FILTER 1
+%define CONFIG_TRIM_FILTER 1
+%define CONFIG_UNSHARP_FILTER 1
+%define CONFIG_VFLIP_FILTER 1
+%define CONFIG_YADIF_FILTER 1
+%define CONFIG_COLOR_FILTER 1
+%define CONFIG_FREI0R_SRC_FILTER 0
+%define CONFIG_MOVIE_FILTER 1
+%define CONFIG_NULLSRC_FILTER 1
+%define CONFIG_RGBTESTSRC_FILTER 1
+%define CONFIG_TESTSRC_FILTER 1
+%define CONFIG_NULLSINK_FILTER 1
+%define CONFIG_H263_VAAPI_HWACCEL 0
+%define CONFIG_H263_VDPAU_HWACCEL 0
+%define CONFIG_H264_DXVA2_HWACCEL 0
+%define CONFIG_H264_VAAPI_HWACCEL 0
+%define CONFIG_H264_VDA_HWACCEL 0
+%define CONFIG_H264_VDA_OLD_HWACCEL 0
+%define CONFIG_H264_VDPAU_HWACCEL 0
+%define CONFIG_MPEG1_VDPAU_HWACCEL 0
+%define CONFIG_MPEG2_DXVA2_HWACCEL 0
+%define CONFIG_MPEG2_VAAPI_HWACCEL 0
+%define CONFIG_MPEG2_VDPAU_HWACCEL 0
+%define CONFIG_MPEG4_VAAPI_HWACCEL 0
+%define CONFIG_MPEG4_VDPAU_HWACCEL 0
+%define CONFIG_VC1_DXVA2_HWACCEL 0
+%define CONFIG_VC1_VAAPI_HWACCEL 0
+%define CONFIG_VC1_VDPAU_HWACCEL 0
+%define CONFIG_WMV3_DXVA2_HWACCEL 0
+%define CONFIG_WMV3_VAAPI_HWACCEL 0
+%define CONFIG_WMV3_VDPAU_HWACCEL 0
+%define CONFIG_ALSA_INDEV 1
+%define CONFIG_BKTR_INDEV 0
+%define CONFIG_DV1394_INDEV 1
+%define CONFIG_FBDEV_INDEV 1
+%define CONFIG_JACK_INDEV 0
+%define CONFIG_OSS_INDEV 1
+%define CONFIG_PULSE_INDEV 0
+%define CONFIG_SNDIO_INDEV 0
+%define CONFIG_V4L2_INDEV 1
+%define CONFIG_VFWCAP_INDEV 0
+%define CONFIG_X11GRAB_INDEV 0
+%define CONFIG_LIBCDIO_INDEV 0
+%define CONFIG_LIBDC1394_INDEV 0
+%define CONFIG_A64_MUXER 1
+%define CONFIG_AC3_MUXER 1
+%define CONFIG_ADTS_MUXER 1
+%define CONFIG_ADX_MUXER 1
+%define CONFIG_AIFF_MUXER 1
+%define CONFIG_AMR_MUXER 1
+%define CONFIG_ASF_MUXER 1
+%define CONFIG_ASS_MUXER 1
+%define CONFIG_ASF_STREAM_MUXER 1
+%define CONFIG_AU_MUXER 1
+%define CONFIG_AVI_MUXER 1
+%define CONFIG_AVM2_MUXER 1
+%define CONFIG_CAVSVIDEO_MUXER 1
+%define CONFIG_CRC_MUXER 1
+%define CONFIG_DAUD_MUXER 1
+%define CONFIG_DIRAC_MUXER 1
+%define CONFIG_DNXHD_MUXER 1
+%define CONFIG_DTS_MUXER 1
+%define CONFIG_DV_MUXER 1
+%define CONFIG_EAC3_MUXER 1
+%define CONFIG_F4V_MUXER 1
+%define CONFIG_FFMETADATA_MUXER 1
+%define CONFIG_FILMSTRIP_MUXER 1
+%define CONFIG_FLAC_MUXER 1
+%define CONFIG_FLV_MUXER 1
+%define CONFIG_FRAMECRC_MUXER 1
+%define CONFIG_FRAMEMD5_MUXER 1
+%define CONFIG_G722_MUXER 1
+%define CONFIG_GIF_MUXER 1
+%define CONFIG_GXF_MUXER 1
+%define CONFIG_H261_MUXER 1
+%define CONFIG_H263_MUXER 1
+%define CONFIG_H264_MUXER 1
+%define CONFIG_HDS_MUXER 1
+%define CONFIG_HEVC_MUXER 1
+%define CONFIG_HLS_MUXER 1
+%define CONFIG_ILBC_MUXER 1
+%define CONFIG_IMAGE2_MUXER 1
+%define CONFIG_IMAGE2PIPE_MUXER 1
+%define CONFIG_IPOD_MUXER 1
+%define CONFIG_ISMV_MUXER 1
+%define CONFIG_IVF_MUXER 1
+%define CONFIG_LATM_MUXER 1
+%define CONFIG_M4V_MUXER 1
+%define CONFIG_MD5_MUXER 1
+%define CONFIG_MATROSKA_MUXER 1
+%define CONFIG_MATROSKA_AUDIO_MUXER 1
+%define CONFIG_MJPEG_MUXER 1
+%define CONFIG_MLP_MUXER 1
+%define CONFIG_MMF_MUXER 1
+%define CONFIG_MOV_MUXER 1
+%define CONFIG_MP2_MUXER 1
+%define CONFIG_MP3_MUXER 1
+%define CONFIG_MP4_MUXER 1
+%define CONFIG_MPEG1SYSTEM_MUXER 1
+%define CONFIG_MPEG1VCD_MUXER 1
+%define CONFIG_MPEG1VIDEO_MUXER 1
+%define CONFIG_MPEG2DVD_MUXER 1
+%define CONFIG_MPEG2SVCD_MUXER 1
+%define CONFIG_MPEG2VIDEO_MUXER 1
+%define CONFIG_MPEG2VOB_MUXER 1
+%define CONFIG_MPEGTS_MUXER 1
+%define CONFIG_MPJPEG_MUXER 1
+%define CONFIG_MXF_MUXER 1
+%define CONFIG_MXF_D10_MUXER 1
+%define CONFIG_NULL_MUXER 1
+%define CONFIG_NUT_MUXER 1
+%define CONFIG_OGG_MUXER 1
+%define CONFIG_OMA_MUXER 1
+%define CONFIG_PCM_ALAW_MUXER 1
+%define CONFIG_PCM_MULAW_MUXER 1
+%define CONFIG_PCM_F64BE_MUXER 1
+%define CONFIG_PCM_F64LE_MUXER 1
+%define CONFIG_PCM_F32BE_MUXER 1
+%define CONFIG_PCM_F32LE_MUXER 1
+%define CONFIG_PCM_S32BE_MUXER 1
+%define CONFIG_PCM_S32LE_MUXER 1
+%define CONFIG_PCM_S24BE_MUXER 1
+%define CONFIG_PCM_S24LE_MUXER 1
+%define CONFIG_PCM_S16BE_MUXER 1
+%define CONFIG_PCM_S16LE_MUXER 1
+%define CONFIG_PCM_S8_MUXER 1
+%define CONFIG_PCM_U32BE_MUXER 1
+%define CONFIG_PCM_U32LE_MUXER 1
+%define CONFIG_PCM_U24BE_MUXER 1
+%define CONFIG_PCM_U24LE_MUXER 1
+%define CONFIG_PCM_U16BE_MUXER 1
+%define CONFIG_PCM_U16LE_MUXER 1
+%define CONFIG_PCM_U8_MUXER 1
+%define CONFIG_PSP_MUXER 1
+%define CONFIG_RAWVIDEO_MUXER 1
+%define CONFIG_RM_MUXER 1
+%define CONFIG_ROQ_MUXER 1
+%define CONFIG_RSO_MUXER 1
+%define CONFIG_RTP_MUXER 1
+%define CONFIG_RTSP_MUXER 1
+%define CONFIG_SAP_MUXER 1
+%define CONFIG_SEGMENT_MUXER 1
+%define CONFIG_SMJPEG_MUXER 1
+%define CONFIG_SMOOTHSTREAMING_MUXER 1
+%define CONFIG_SOX_MUXER 1
+%define CONFIG_SPDIF_MUXER 1
+%define CONFIG_SRT_MUXER 1
+%define CONFIG_SWF_MUXER 1
+%define CONFIG_TG2_MUXER 1
+%define CONFIG_TGP_MUXER 1
+%define CONFIG_TRUEHD_MUXER 1
+%define CONFIG_VC1T_MUXER 1
+%define CONFIG_VOC_MUXER 1
+%define CONFIG_WAV_MUXER 1
+%define CONFIG_WEBM_MUXER 1
+%define CONFIG_WV_MUXER 1
+%define CONFIG_YUV4MPEGPIPE_MUXER 1
+%define CONFIG_ALSA_OUTDEV 1
+%define CONFIG_OSS_OUTDEV 1
+%define CONFIG_SNDIO_OUTDEV 0
+%define CONFIG_AAC_PARSER 1
+%define CONFIG_AAC_LATM_PARSER 1
+%define CONFIG_AC3_PARSER 1
+%define CONFIG_ADX_PARSER 1
+%define CONFIG_BMP_PARSER 1
+%define CONFIG_CAVSVIDEO_PARSER 1
+%define CONFIG_COOK_PARSER 1
+%define CONFIG_DCA_PARSER 1
+%define CONFIG_DIRAC_PARSER 1
+%define CONFIG_DNXHD_PARSER 1
+%define CONFIG_DVBSUB_PARSER 1
+%define CONFIG_DVDSUB_PARSER 1
+%define CONFIG_FLAC_PARSER 1
+%define CONFIG_GSM_PARSER 1
+%define CONFIG_H261_PARSER 1
+%define CONFIG_H263_PARSER 1
+%define CONFIG_H264_PARSER 1
+%define CONFIG_HEVC_PARSER 1
+%define CONFIG_MJPEG_PARSER 1
+%define CONFIG_MLP_PARSER 1
+%define CONFIG_MPEG4VIDEO_PARSER 1
+%define CONFIG_MPEGAUDIO_PARSER 1
+%define CONFIG_MPEGVIDEO_PARSER 1
+%define CONFIG_OPUS_PARSER 1
+%define CONFIG_PNG_PARSER 1
+%define CONFIG_PNM_PARSER 1
+%define CONFIG_RV30_PARSER 1
+%define CONFIG_RV40_PARSER 1
+%define CONFIG_TAK_PARSER 1
+%define CONFIG_VC1_PARSER 1
+%define CONFIG_VORBIS_PARSER 1
+%define CONFIG_VP3_PARSER 1
+%define CONFIG_VP8_PARSER 1
+%define CONFIG_CONCAT_PROTOCOL 1
+%define CONFIG_CRYPTO_PROTOCOL 1
+%define CONFIG_FFRTMPCRYPT_PROTOCOL 0
+%define CONFIG_FFRTMPHTTP_PROTOCOL 1
+%define CONFIG_FILE_PROTOCOL 1
+%define CONFIG_GOPHER_PROTOCOL 1
+%define CONFIG_HLS_PROTOCOL 1
+%define CONFIG_HTTP_PROTOCOL 1
+%define CONFIG_HTTPPROXY_PROTOCOL 1
+%define CONFIG_HTTPS_PROTOCOL 0
+%define CONFIG_MMSH_PROTOCOL 1
+%define CONFIG_MMST_PROTOCOL 1
+%define CONFIG_MD5_PROTOCOL 1
+%define CONFIG_PIPE_PROTOCOL 1
+%define CONFIG_RTMP_PROTOCOL 1
+%define CONFIG_RTMPE_PROTOCOL 0
+%define CONFIG_RTMPS_PROTOCOL 0
+%define CONFIG_RTMPT_PROTOCOL 1
+%define CONFIG_RTMPTE_PROTOCOL 0
+%define CONFIG_RTMPTS_PROTOCOL 0
+%define CONFIG_RTP_PROTOCOL 1
+%define CONFIG_SCTP_PROTOCOL 0
+%define CONFIG_SRTP_PROTOCOL 1
+%define CONFIG_TCP_PROTOCOL 1
+%define CONFIG_TLS_PROTOCOL 0
+%define CONFIG_UDP_PROTOCOL 1
+%define CONFIG_UNIX_PROTOCOL 1
+%define CONFIG_LIBRTMP_PROTOCOL 0
+%define CONFIG_LIBRTMPE_PROTOCOL 0
+%define CONFIG_LIBRTMPS_PROTOCOL 0
+%define CONFIG_LIBRTMPT_PROTOCOL 0
+%define CONFIG_LIBRTMPTE_PROTOCOL 0
diff --git a/src/expansion.asm b/src/expansion.asm
new file mode 100644
index 0000000..4ac77bf
--- /dev/null
+++ b/src/expansion.asm
@@ -0,0 +1,91 @@
+%include "x86util.asm"
+
+SECTION .text
+
+; scalarproduct_metric(len1, len2, mat, vec1, vec2): leaves in the low double of xmm0 the bilinear form
+;   vec2^T·mat·vec1 = sum_{j=0}^{len2-1} sum_{i=0}^{len1-1} mat[j * len1 + i] * vec1[i] * vec2[j]
+; mat is read as len2 consecutive rows of len1 doubles; both loops step by mmsize bytes and test only afterwards, so len1 and len2 must be non-zero multiples of mmsize / 8.
+%macro SCALARPRODUCT_METRIC 0
+cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos
+ shl len2q, 3 ; element counts -> byte counts (8-byte doubles)
+ shl len1q, 3 ; NOTE(review): shifts the full-width register, so this assumes the caller passes register-wide lengths -- confirm against the C prototype
+; point vec1/vec2 past their ends so that negative byte offsets counting up to zero can index them
+ add vec1q, len1q
+ add vec2q, len2q
+ neg len2q ; len2q = -(size of vec2 in bytes); doubles as the outer loop counter
+; r6 = byte offset of the fourth matrix row of a pass (only read by the mmsize == 32 path)
+ lea r6, [3 * len1q]
+; m0 accumulates the per-lane partial sums of the final result
+ xorpd m0, m0
+; outer loop: each pass consumes mmsize / 8 matrix rows and the same number of vec2 elements
+.loop_2:
+ mov rowposq, len1q
+ neg rowposq ; rowposq = -(row length in bytes); inner loop counter and offset into vec1
+; m1..m4 each hold the running row·vec1 products of one matrix row (m3/m4 only when mmsize == 32)
+ xorpd m1, m1
+ xorpd m2, m2
+
+%if mmsize == 32
+ xorpd m3, m3
+ xorpd m4, m4
+%endif
+; inner loop: combine one vector of vec1 with the matching columns of every row in flight
+.loop_1:
+ mova m5, [vec1q + rowposq]
+; FMULADD_PD comes from x86util.asm; presumably dst = src1 * src2 + src3 with the last operand as scratch -- TODO confirm
+%if mmsize == 32
+ FMULADD_PD m4, m5, [matq + r6q], m4, m6
+ FMULADD_PD m3, m5, [matq + 2 * len1q], m3, m6
+%endif
+
+ FMULADD_PD m2, m5, [matq + 1 * len1q], m2, m6
+ FMULADD_PD m1, m5, [matq + 0 * len1q], m1, m6
+
+ add matq, mmsize
+ add rowposq, mmsize
+ js .loop_1 ; keep going while the offset is still negative
+; horizontal reduction: collapse each row accumulator to a scalar and pack the scalars into m1 in row order
+ haddpd m1, m2
+
+%if mmsize == 32
+ vextractf128 xmm2, ymm1, 1
+ addpd xmm1, xmm2
+
+ haddpd m3, m4
+ vextractf128 xmm4, ymm3, 1
+ addpd xmm3, xmm4
+
+ vinsertf128 ymm1, ymm1, xmm3, 1
+%endif
+; weight the per-row results with the corresponding vec2 elements
+ FMULADD_PD m0, m1, [vec2q + len2q], m0, m6
+; the inner loop already advanced matq by one row; skip the remaining rows handled in this pass
+%if mmsize == 32
+ add matq, r6
+%else
+ add matq, len1q
+%endif
+ add len2q, mmsize
+ js .loop_2
+; fold the lanes of m0 into its lowest element
+ haddpd m0, m0
+
+%if mmsize == 32
+ vextractf128 xmm1, ymm0, 1
+ addpd xmm0, xmm1
+%endif
+; NOTE(review): no MMX register is touched above, so this emms looks redundant -- confirm before removing
+ emms
+
+ RET
+%endmacro
+; Instantiate the macro once per instruction-set flavour; the mmsize == 32 branches in the body select the four-row path.
+INIT_XMM sse3
+SCALARPRODUCT_METRIC
+; NOTE(review): presumably x86inc derives a distinct symbol suffix from each INIT_* cpuflag -- confirm in x86inc.asm
+INIT_YMM avx
+SCALARPRODUCT_METRIC
+; NOTE(review): presumably FMULADD_PD switches to a fused multiply-add for this flavour -- confirm in x86util.asm
+INIT_YMM fma3
+SCALARPRODUCT_METRIC
+
diff --git a/src/gamma_freeze_template.c b/src/gamma_freeze_template.c
new file mode 100644
index 0000000..8edda4d
--- /dev/null
+++ b/src/gamma_freeze_template.c
@@ -0,0 +1,507 @@
+/*
+ * Minimal distortion -- template for the equations definitions
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* FUNC(name) expands to name_<EQUATION>. The extra FUNC2 level makes sure
+ * that EQUATION is macro-expanded before it is pasted onto the name. */
+#define FUNC3(a, b) a ## _ ## b
+#define FUNC2(a, b) FUNC3(a, b)
+#define FUNC(name) FUNC2(name, EQUATION)
+
+/**
+ * A template for calculating the equation coefficients.
+ *
+ * One job fills eq_ctx->eq_coeffs[0..1][*][i] and eq_ctx->rhs[i] for the
+ * collocation points i in [job_idx * block_size, (job_idx + 1) * block_size),
+ * clipped to NB_COLLOC_POINTS(ctx). The compile-time value of EQUATION selects
+ * whether the coefficients of equation 0 (rhs_x) or equation 1 (rhs_z) are
+ * stored.
+ *
+ * @param arg        a MDCalcEqThread with the solver and equation contexts
+ * @param job_idx    index of the block of collocation points to process
+ * @param nb_jobs    not used here
+ * @param thread_idx not used here
+ * @param nb_threads not used here
+ */
+static void FUNC(calc_eq_coeffs)(void *arg,
+ unsigned int job_idx, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads)
+{
+ const MDCalcEqThread *et = arg;
+ const MDSolver *ctx = et->ctx;
+ MDEquationContext *eq_ctx = et->eq_ctx;
+
+ const int start = job_idx * et->block_size;
+ const int end = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx));
+
+ for (int i = start; i < end; i++) {
+ const double x = eq_ctx->interp_coords[0][i];
+ const double z = eq_ctx->interp_coords[2][i];
+ // points with x <= EPS use separate expressions that avoid dividing by x
+ const int zaxis = x <= EPS;
+
+ double c1o3 = (1.0 / 3.0);
+
+ double gtu[3][3], g[3][3], gu[3][3];
+ double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], dgtu[3][3][3], G[3][3][3], dG[3][3][3][3];
+ double Gt[3][3][3];
+ double dXt[3][3];
+ double A[3][3], Au[3][3], Atu[3][3];
+ double dA[3][3][3], dAu[3][3][3];
+ double Ric[3][3], Ricm[3][3];
+ double rhs_x, rhs_z;
+
+ // \tilde{γ}_{jk}
+ const double gtxx = eq_ctx->interp_values[I_GTXX][i];
+ const double gtyy = eq_ctx->interp_values[I_GTYY][i];
+ const double gtzz = eq_ctx->interp_values[I_GTZZ][i];
+ const double gtxy = eq_ctx->interp_values[I_GTXY][i];
+ const double gtxz = eq_ctx->interp_values[I_GTXZ][i];
+ const double gtyz = eq_ctx->interp_values[I_GTYZ][i];
+
+ const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+ { gtxy, gtyy, gtyz },
+ { gtxz, gtyz, gtzz }};
+
+ const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i];
+ const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i];
+ const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i];
+ const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i];
+
+ const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i];
+ const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i];
+ const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i];
+ const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i];
+
+ // ∂_j \tilde{γ}_{kl}; only x- and z-derivatives are interpolated, the
+ // y-derivatives (middle block) are built from the x-derivatives and the
+ // values divided by x -- presumably a consequence of axial symmetry
+ const double dgt[3][3][3] = {
+ {
+ { dx_gt11, 0.0, dx_gt13 },
+ { 0.0, dx_gt22, 0.0 },
+ { dx_gt13, 0.0, dx_gt33 },
+ },
+ {
+ { 0.0, zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 },
+ { zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x },
+ { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 },
+ },
+ {
+ { dz_gt11, 0.0, dz_gt13 },
+ { 0.0, dz_gt22, 0.0 },
+ { dz_gt13, 0.0, dz_gt33 },
+ },
+ };
+
+ const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i];
+ const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i];
+ const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i];
+ const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i];
+
+ const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i];
+ const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i];
+ const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i];
+ const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i];
+
+ const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i];
+ const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i];
+ const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i];
+ const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i];
+
+ // ∂_{jk} \tilde{γ}_{lm}, indexed as d2gt[j][k][l][m]
+ const double d2gt[3][3][3][3] = {
+ {
+ {
+ { dxx_gt11, 0.0, dxx_gt13 },
+ { 0.0, dxx_gt22, 0.0 },
+ { dxx_gt13, 0.0, dxx_gt33 },
+ },
+ {
+ { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+ },
+ {
+ { dxz_gt11, 0.0, dxz_gt13 },
+ { 0.0, dxz_gt22, 0.0 },
+ { dxz_gt13, 0.0, dxz_gt33 },
+ },
+
+ },
+ {
+ {
+ { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+ },
+ {
+ { zaxis ? dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x },
+ },
+ {
+ { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+ { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+ zaxis ? dxz_gt13 : dz_gt13 / x },
+ { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+ },
+
+ },
+ {
+ {
+ { dxz_gt11, 0.0, dxz_gt13 },
+ { 0.0, dxz_gt22, 0.0 },
+ { dxz_gt13, 0.0, dxz_gt33 },
+ },
+ {
+ { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+ { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+ zaxis ? dxz_gt13 : dz_gt13 / x },
+ { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+ },
+ {
+ { dzz_gt11, 0.0, dzz_gt13 },
+ { 0.0, dzz_gt22, 0.0 },
+ { dzz_gt13, 0.0, dzz_gt33 },
+ },
+
+ },
+ };
+
+ // \tilde{A}_{jk}
+ const double Atxx = eq_ctx->interp_values[I_ATXX][i];
+ const double Atyy = eq_ctx->interp_values[I_ATYY][i];
+ const double Atzz = eq_ctx->interp_values[I_ATZZ][i];
+ const double Atxy = eq_ctx->interp_values[I_ATXY][i];
+ const double Atxz = eq_ctx->interp_values[I_ATXZ][i];
+ const double Atyz = eq_ctx->interp_values[I_ATYZ][i];
+
+ // trK itself is not read below, only its derivatives are
+ const double trK = eq_ctx->interp_values[I_TRK][i];
+
+ const double dx_trK = eq_ctx->interp_values[I_TRK_DX][i];
+ const double dz_trK = eq_ctx->interp_values[I_TRK_DZ][i];
+
+ const double dtrK[3] = { dx_trK, 0.0, dz_trK };
+
+ const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i];
+ const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i];
+ const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i];
+ const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i];
+
+ const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i];
+ const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i];
+ const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i];
+ const double dz_At13 = eq_ctx->interp_values[I_ATXZ_DZ][i];
+
+ // ∂_j \tilde{A}_{kl}, built the same way as dgt above
+ const double dAt[3][3][3] = {
+ {
+ { dx_At11, 0.0, dx_At13 },
+ { 0.0, dx_At22, 0.0 },
+ { dx_At13, 0.0, dx_At33 },
+ },
+ {
+ { 0.0, zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 },
+ { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x },
+ { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 },
+ },
+ {
+ { dz_At11, 0.0, dz_At13 },
+ { 0.0, dz_At22, 0.0 },
+ { dz_At13, 0.0, dz_At33 },
+ },
+ };
+
+ // conformal factor; below g_{jk} = gt_{jk} / phi^2
+ const double phi = eq_ctx->interp_values[I_PHI][i];
+
+ const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i];
+ const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i];
+
+ const double dphi[3] = { phi_dx, 0.0, phi_dz };
+
+ const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i];
+ const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i];
+ const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i];
+
+ const double d2phi[3][3] = {
+ { phi_dxx, 0.0, phi_dxz },
+ { 0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 },
+ { phi_dxz, 0.0, phi_dzz },
+ };
+
+ const double At[3][3] = {{ Atxx, Atxy, Atxz },
+ { Atxy, Atyy, Atyz },
+ { Atxz, Atyz, Atzz }};
+
+ const double alpha = eq_ctx->interp_values[I_ALPHA][i];
+ const double dx_alpha = eq_ctx->interp_values[I_ALPHA_DX][i];
+ const double dz_alpha = eq_ctx->interp_values[I_ALPHA_DZ][i];
+
+ const double dalpha[3] = { dx_alpha, 0.0, dz_alpha };
+
+ const double Xtx = eq_ctx->interp_values[I_XTX][i];
+ const double Xtz = eq_ctx->interp_values[I_XTZ][i];
+
+ const double Xt[3] = { Xtx, 0.0, Xtz };
+
+ // determinant of \tilde{γ}_{jk}
+ const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+ // \tilde{γ}^{ij}
+ gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det;
+ gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det;
+ gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det;
+ gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+ gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det;
+ gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+ gtu[1][0] = gtu[0][1];
+ gtu[2][0] = gtu[0][2];
+ gtu[2][1] = gtu[1][2];
+
+ // γ_{jk} and γ^{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ gu[j][k] = SQR(phi) * gtu[j][k];
+ g[j][k] = gt[j][k] / SQR(phi);
+ }
+
+ // ∂_j γ_{kl} and ∂_j A_{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
+ dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi);
+ }
+
+ // ∂_j \tilde{γ}^{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ for (int n = 0; n < 3; n++)
+ val += -gtu[k][m] * gtu[l][n] * dgt[j][m][n];
+ dgtu[j][k][l] = val;
+ }
+
+ // ∂_j γ^{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ for (int n = 0; n < 3; n++)
+ val += -gu[k][m] * gu[l][n] * dg[j][m][n];
+ dgu[j][k][l] = val;
+ }
+
+ // ∂_{jk} g_{lm}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++) {
+ d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) -
+ 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) -
+ 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) -
+ 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) +
+ d2gt[j][k][l][m] / SQR(phi);
+ }
+
+ // \tilde{Γ}^j_{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ val += 0.5 * gtu[j][m] * (dgt[k][l][m] + dgt[l][k][m] - dgt[m][k][l]);
+ Gt[j][k][l] = val;
+ }
+
+ // Γ^j_{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]);
+ G[j][k][l] = val;
+ }
+
+ // ∂_j Γ^k_{lm}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++) {
+ double val = 0.0;
+ for (int n = 0; n < 3; n++) {
+ val += dgu[j][k][n] * (dg [l][m][n] + dg [m][l][n] - dg [n][l][m]) +
+ gu [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]);
+ }
+ dG[j][k][l][m] = 0.5 * val;
+ }
+
+ // ∂_j Γ^k, stored as dXt[j][k]
+ // NOTE(review): this contracts the conformal gtu/dgtu with the Christoffel
+ // symbols G/dG of the physical metric rather than with Gt -- confirm that
+ // this is intended
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++)
+ val += gtu[l][m] * dG[j][k][l][m] + dgtu[j][l][m] * G[k][l][m];
+ dXt[j][k] = val;
+ }
+
+ // Ric_{jk}
+ // (Ric, Ricm, dAu, Au and X below are computed but not read when the
+ // coefficients are stored at the end of this function)
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ val += dG[m][m][j][k] - dG[k][m][j][m];
+ for (int m = 0; m < 3; m++)
+ for (int l = 0; l < 3; l++)
+ val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l];
+ Ric[j][k] = val;
+ }
+
+ // Ric^j_k
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ val += gu[j][l] * Ric[l][k];
+ Ricm[j][k] = val;
+ }
+
+ // A_{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ A[j][k] = At[j][k] / SQR(phi);
+ }
+
+ // ∂_j A^{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ for (int n = 0; n < 3; n++)
+ val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n];
+ dAu[j][k][l] = val;
+ }
+
+ // \tilde{A}^{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++)
+ val += gtu[j][l] * gtu[k][m] * At[l][m];
+ Atu[j][k] = val;
+ }
+
+ // A^{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++)
+ val += gu[j][l] * gu[k][m] * A[l][m];
+ Au[j][k] = val;
+ }
+
+ // right-hand sides of the x and z equations:
+ // 2 * (\tilde{A}^{ij} ∂_j α - α \tilde{Γ}^i_{jk} \tilde{A}^{jk}
+ //      + (2/3) α \tilde{γ}^{ij} ∂_j K + 3 α \tilde{A}^{ij} ∂_j φ / φ)
+ rhs_x = 0.0;
+ rhs_z = 0.0;
+ for (int j = 0; j < 3; j++) {
+ rhs_x += dalpha[j] * Atu[0][j];
+ rhs_z += dalpha[j] * Atu[2][j];
+ }
+ double val_x = 0.0;
+ double val_z = 0.0;
+ for (int j = 0; j < 3; j++) {
+ for (int k = 0; k < 3; k++) {
+ val_x += -Gt[0][j][k] * Atu[j][k];
+ val_z += -Gt[2][j][k] * Atu[j][k];
+ }
+ }
+ rhs_x += val_x * alpha;
+ rhs_z += val_z * alpha;
+ for (int j = 0; j < 3; j++) {
+ rhs_x += alpha * (2.0 / 3.0) * gtu[0][j] * dtrK[j];
+ rhs_z += alpha * (2.0 / 3.0) * gtu[2][j] * dtrK[j];
+ }
+ for (int j = 0; j < 3; j++) {
+ rhs_x += alpha * 3.0 * Atu[0][j] * dphi[j]/ phi;
+ rhs_z += alpha * 3.0 * Atu[2][j] * dphi[j]/ phi;
+ }
+
+ rhs_x *= 2.0;
+ rhs_z *= 2.0;
+
+ // γ^{jk} Γ^i_{jk} for i = x, z (not read below)
+ double X[3] = { 0.0 };
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ X[0] += gu[j][k] * G[0][j][k];
+ X[2] += gu[j][k] * G[2][j][k];
+ }
+
+ // eq_coeffs[0] multiplies (derivatives of) β^x, eq_coeffs[1] those of β^z
+ if (EQUATION == 0) {
+ /* eq 0 */
+ /* ∂_{xx}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + c1o3 * gtu[0][0] + (zaxis ? 0.5 * (gtu[1][1] + c1o3 * gtu[0][0]) : 0.0);
+ /* ∂_{xx}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0;
+ /* ∂_{zz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2];
+ /* ∂_{zz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gtu[0][2];
+
+ /* ∂_{xz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2] + (zaxis ? c1o3 * gtu[0][2] : 0.0);
+ /* ∂_{xz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[0][0];
+
+ /* ∂_{x}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (2.0 / 3.0) * Xt[0] + (zaxis ? (2.0 / 3.0) * Xt[0] : (gtu[1][1] + c1o3 * gtu[0][0]) / x);
+ /* ∂_{x}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 0.0;
+
+ /* ∂_{z}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (zaxis ? 0.0 : c1o3 * gtu[0][2] / x);
+ /* ∂_{z}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = (2.0 / 3.0) * Xt[0];
+
+ /* β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][0] + (zaxis ? 0.0 : (2.0 / 3.0) * Xt[0] / x - (gtu[1][1] + c1o3 * gtu[0][0]) / SQR(x));
+
+ /* β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][0];
+
+ eq_ctx->rhs[i] = rhs_x;
+ } else {
+ /* eq 1 */
+ /* ∂_{xx}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gtu[2][0] + (zaxis ? 0.5 * c1o3 * gtu[2][0] : 0.0);
+ /* ∂_{xx}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + (zaxis ? gtu[1][1] : 0.0);
+ /* ∂_{zz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0;
+ /* ∂_{zz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2] + c1o3 * gtu[2][2];
+ /* ∂_{xz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[2][2] + (zaxis ? c1o3 * gtu[2][2] : 0.0);
+ /* ∂_{xz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2];
+
+ /* ∂_{x}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = (2.0 / 3.0) * Xt[2] + (zaxis ? (2.0 / 3.0) * Xt[2] : c1o3 * gtu[2][0] / x);
+ /* ∂_{x}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (zaxis ? 0.0 : gtu[1][1] / x);
+ /* ∂_{z}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = (zaxis ? 0.0 : c1o3 * gtu[2][2] / x);
+ /* ∂_{z}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (2.0 / 3.0) * Xt[2];
+
+ /* β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][2] + (zaxis ? 0.0 : (2.0 / 3.0) * Xt[2] / x - c1o3 * gtu[2][0] / SQR(x));
+
+ /* β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][2];
+
+ eq_ctx->rhs[i] = rhs_z;
+ }
+ }
+}
diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..cc89085
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn MinimalDistortionAxi
+
+# Source files in this directory
+SRCS = basis.c bicgstab.c md.c md_solve.c pssolve.c expansion.asm threadpool.c register.c
+
+# Subdirectories containing source files
+SUBDIRS =
diff --git a/src/md.c b/src/md.c
new file mode 100644
index 0000000..21e38fc
--- /dev/null
+++ b/src/md.c
@@ -0,0 +1,573 @@
+#include "common.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "cctk_Parameters.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "md.h"
+#include "md_solve.h"
+#include "threadpool.h"
+
+/* Per-thread state for evaluating the expansion on a slab of the grid.
+ * Only referenced by the disabled (#if 0) thread-pool evaluation code. */
+typedef struct EvalContext {
+    struct MDContext *md;
+    struct CoordPatch *cp;
+    /* grid coordinate arrays, indexed with CCTK_GFINDEX3D */
+    const double *x;
+    const double *z;
+    /* output grid function */
+    double *W;
+
+    /* spectral coefficients to evaluate */
+    const double *coeffs;
+    /* number of coefficients per direction; these are counts used as loop
+     * bounds and allocation sizes, so they are integers, not doubles */
+    int nb_coeffs[2];
+
+    /* scratch vectors holding the basis function values at one point */
+    double *eval_tmp[2];
+
+    /* half-open index ranges [start, end) handled by this context */
+    unsigned int x_idx_start;
+    unsigned int x_idx_end;
+    unsigned int z_idx_start;
+    unsigned int z_idx_end;
+} EvalContext;
+
+/* precomputed values for a given refined grid */
+typedef struct CoordPatch {
+    /* identification of the patch; copied with memcpy() from the
+     * cctk_origin_space, cctk_levfac and cctk_lsh arrays of the cGH, so the
+     * element types must match those arrays exactly (the cGH stores the
+     * latter two as plain int, which CCTK_INT is not guaranteed to be) */
+    CCTK_REAL origin[3];
+    int delta[3];
+    int size[3];
+
+    // basis values on the grid
+    double *basis_val_r;
+    double *basis_val_z;
+
+    double *transform_z;
+    /* basis function values (times a filter factor) at every point of the
+     * x-z plane, filled in get_coord_patch() */
+    double *transform_matrix;
+    double *transform_matrix1;
+    double *transform_matrix2;
+    double *transform_matrix3;
+    /* scratch space for the intermediate product in minimal_distortion_eval() */
+    double *transform_tmp;
+
+    /* grid index along y of the y == 0 plane */
+    int y_idx;
+
+    /* only used by the disabled thread-pool evaluation code */
+    int nb_threads;
+    ThreadPoolContext *tp;
+    EvalContext *ec;
+} CoordPatch;
+
+/* Global state of the thorn; a single instance is created by context_init(). */
+struct MDContext {
+ MDSolver *solver;
+ cGH *gh;
+ ThreadPoolContext *tp;
+
+ /* the most recent solutions, oldest first; minimal_distortion_solve()
+ * appends at index nb_solutions - 1 and drops entry 0 once all
+ * slots are in use */
+ struct {
+ double time;
+ double *coeffs;
+ } solution_cache[8];
+ int nb_solutions;
+
+ /* scratch buffer for modified coefficients; only referenced by the
+ * disabled preprocessor branches of minimal_distortion_eval() */
+ double *coeffs_eval;
+
+ /* number of minimal_distortion_eval() calls and the accumulated
+ * gettime() differences spent in them (presumably microseconds, since
+ * the stats divide by 1e6 to print seconds) */
+ uint64_t grid_expand_count;
+ uint64_t grid_expand_time;
+
+ /* cache of per-grid precomputed data, filled by get_coord_patch() */
+ CoordPatch *patches;
+ int nb_patches;
+};
+
+/* Estimate the "main" frequency component of the basis function of the given
+ * order, taken to be the value md_basis_colloc_point() returns for it. */
+static double calc_basis_freq(const MDBasisSetContext *b, int order)
+{
+    const double freq = md_basis_colloc_point(b, order, 1);
+
+    return freq;
+}
+
+/**
+ * Return the precomputed basis tables for the grid currently described by
+ * md->gh, creating and caching them on first use.
+ *
+ * A cached patch is reused when its origin, local size and refinement factors
+ * all match the current grid. Otherwise a new entry is appended to
+ * md->patches and the basis functions (multiplied by an order-dependent
+ * exponential filter) are evaluated at every point of the x-z plane.
+ *
+ * Note that appending reallocates md->patches, so pointers returned by
+ * earlier calls must not be kept across calls.
+ *
+ * @param md           the thorn context
+ * @param x, y, z      coordinate grid functions, indexed with CCTK_GFINDEX3D
+ * @param scale_factor multiplies the per-order length scale of the filter
+ * @param scale_power  exponent of the filter
+ * @return the patch; allocation failures and a grid without a y == 0 plane
+ *         are reported through the fatal CCTK_WARN(0, ...)
+ */
+static CoordPatch *get_coord_patch(MDContext *md,
+                                   CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+                                   double scale_factor, double scale_power)
+{
+    cGH *cctkGH = md->gh;
+
+    CoordPatch *cp, *patches;
+    int i, ret;
+    /* only referenced by the disabled thread-pool setup below */
+    int block_size;
+    const char *nb_threads;
+
+    for (int p = 0; p < md->nb_patches; p++) {
+        cp = &md->patches[p];
+
+        if (cp->origin[0] == md->gh->cctk_origin_space[0] &&
+            cp->origin[1] == md->gh->cctk_origin_space[1] &&
+            cp->origin[2] == md->gh->cctk_origin_space[2] &&
+            cp->size[0]   == md->gh->cctk_lsh[0]          &&
+            cp->size[1]   == md->gh->cctk_lsh[1]          &&
+            cp->size[2]   == md->gh->cctk_lsh[2]          &&
+            cp->delta[0]  == md->gh->cctk_levfac[0]       &&
+            cp->delta[1]  == md->gh->cctk_levfac[1]       &&
+            cp->delta[2]  == md->gh->cctk_levfac[2])
+            return cp;
+    }
+
+    /* create a new patch; keep the old array reachable if realloc() fails */
+    patches = realloc(md->patches, sizeof(*md->patches) * (md->nb_patches + 1));
+    if (!patches)
+        CCTK_WARN(0, "Out of memory when allocating a new coordinate patch");
+    md->patches = patches;
+    cp = &md->patches[md->nb_patches];
+
+    memset(cp, 0, sizeof(*cp));
+
+    memcpy(cp->origin, md->gh->cctk_origin_space, sizeof(cp->origin));
+    memcpy(cp->size,   md->gh->cctk_lsh,          sizeof(cp->size));
+    memcpy(cp->delta,  md->gh->cctk_levfac,       sizeof(cp->delta));
+
+    /* locate the y == 0 plane */
+    for (i = 0; i < cp->size[1]; i++)
+        if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) {
+            cp->y_idx = i;
+            break;
+        }
+    if (i == cp->size[1])
+        CCTK_WARN(0, "The grid does not include y==0");
+
+#if MD_POLAR || 1
+    ret  = posix_memalign((void**)&cp->transform_matrix,  32, sizeof(*cp->transform_matrix)  * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix2, 32, sizeof(*cp->transform_matrix2) * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_matrix3, 32, sizeof(*cp->transform_matrix3) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]);
+    ret |= posix_memalign((void**)&cp->transform_tmp,     32, sizeof(*cp->transform_tmp)     * cp->size[0] * cp->size[2] * md->solver->nb_coeffs[1]);
+    if (ret)
+        CCTK_WARN(0, "Out of memory when allocating the coordinate patch tables");
+
+#pragma omp parallel for
+    for (int j = 0; j < cp->size[2]; j++) {
+        double zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, j)];
+
+        for (int i = 0; i < cp->size[0]; i++) {
+            const int idx_grid = j * cp->size[0] + i;
+
+            double xx = x[CCTK_GFINDEX3D(md->gh, i, 0, 0)];
+            double rr = sqrt(SQR(xx) + SQR(zz));
+
+            double coord0 = xx;
+            double coord1 = zz;
+
+            //for (int k = 0; k < md->nb_coeffs_z; k++)
+            //    for (int l = 0; l < md->nb_coeffs_x; l++) {
+            //        const int idx_coeff = k * md->nb_coeffs_x + l;
+            //        cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = md->basis->eval(r, l) * md->basis1->eval(phi, k);
+            //    }
+
+            /* each basis function is damped by exp(-36 (r / r0)^scale_power),
+             * where r0 grows with the function's length scale, capped at 60 */
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++) {
+                double dx   = calc_basis_freq(md->solver->basis[0][0], k);
+                double r0   = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[0][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[1]; k++) {
+                double dx   = calc_basis_freq(md->solver->basis[0][1], k);
+                double r0   = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix1[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[0][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++) {
+                double dx   = calc_basis_freq(md->solver->basis[1][0], k);
+                double r0   = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix2[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[1][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact;
+            }
+            for (int k = 0; k < md->solver->nb_coeffs[1]; k++) {
+                double dx   = calc_basis_freq(md->solver->basis[1][1], k);
+                double r0   = MIN(60.0, dx * scale_factor);
+                double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+                cp->transform_matrix3[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[1][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact;
+            }
+        }
+    }
+#else
+    posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * md->solver->nb_coeffs[0] * md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0]);
+    for (int j = 0; j < md->gh->cctk_lsh[1]; j++)
+        for (int i = 0; i < md->gh->cctk_lsh[0]; i++) {
+            CCTK_REAL xx = x[CCTK_GFINDEX3D(md->gh, i, j, 0)];
+            CCTK_REAL yy = y[CCTK_GFINDEX3D(md->gh, i, j, 0)];
+            CCTK_REAL r  = sqrt(SQR(xx) + SQR(yy));
+
+            for (int k = 0; k < md->solver->nb_coeffs[0]; k++)
+                //cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) * md->nb_coeffs_x + k] = md->basis->eval(r, k);
+                cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) + md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0] * k] = md->solver->basis[0]->eval(r, k);
+        }
+
+    posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * md->solver->nb_coeffs[1] * md->gh->cctk_lsh[2]);
+    for (int i = 0; i < md->gh->cctk_lsh[2]; i++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, i)];
+        for (int j = 0; j < md->solver->nb_coeffs[1]; j++)
+            cp->basis_val_z [i * md->solver->nb_coeffs[1] + j] = md->solver->basis[0]->eval(fabs(zz), j);
+            //cp->basis_val_z [i + md->gh->cctk_lsh[2] * j] = md->basis->eval(zz, j);
+    }
+    posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * md->solver->nb_coeffs[0]);
+#endif
+
+#if 0
+    nb_threads = getenv("OMP_NUM_THREADS");
+    if (nb_threads)
+        cp->nb_threads = atoi(nb_threads);
+    if (cp->nb_threads <= 0)
+        cp->nb_threads = 1;
+    md_threadpool_init(&cp->tp, cp->nb_threads);
+    cp->ec = calloc(cp->nb_threads, sizeof(*cp->ec));
+
+    block_size = (md->gh->cctk_lsh[2] + cp->nb_threads - 1) / cp->nb_threads;
+
+    for (int i = 0; i < cp->nb_threads; i++) {
+        EvalContext *ec = &cp->ec[i];
+
+        ec->md = md;
+
+        ec->nb_coeffs[0] = md->solver->nb_coeffs[0];
+        ec->nb_coeffs[1] = md->solver->nb_coeffs[1];
+
+        posix_memalign((void**)&ec->eval_tmp[0], 32, sizeof(*ec->eval_tmp[0]) * ec->nb_coeffs[0]);
+        posix_memalign((void**)&ec->eval_tmp[1], 32, sizeof(*ec->eval_tmp[1]) * ec->nb_coeffs[1]);
+
+        ec->x_idx_start = 0;
+        ec->x_idx_end   = md->gh->cctk_lsh[0];
+
+        ec->z_idx_start = block_size * i;
+        ec->z_idx_end   = MIN(block_size * (i + 1), md->gh->cctk_lsh[2]);
+    }
+#endif
+
+    md->nb_patches++;
+    return cp;
+}
+
+/* the single global thorn context, created lazily by context_init() */
+static MDContext *md_context;
+
+/**
+ * Allocate and set up the global thorn context (thread pool, solver and
+ * coefficient buffers) and store it in md_context.
+ *
+ * @return 0 on success, a negative error code on failure; md_context is left
+ *         untouched on failure
+ */
+static int context_init(cGH *cctkGH)
+{
+    int threads_type;
+    const int *threads = CCTK_ParameterGet("num_threads", "Carpet", &threads_type);
+
+    MDContext *md;
+    int ret;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    /* the parameter lookup fails with NULL, e.g. when Carpet is not active */
+    if (!threads)
+        return -EINVAL;
+
+    md = calloc(1, sizeof(*md));
+    if (!md)
+        return -ENOMEM;
+
+    md->gh = cctkGH;
+
+    ret = md_threadpool_init(&md->tp, *threads);
+    if (ret < 0)
+        goto fail;
+
+    ret = md_solver_init(&md->solver, cctkGH, md->tp, 2,
+                         (unsigned int [2][2]){ { basis_order_r, basis_order_z },
+                                                { basis_order_r, basis_order_z }},
+                         scale_factor, filter_power, 0.0);
+    if (ret < 0)
+        goto fail;
+
+    /* holds the coefficients of both shift components, same as the
+     * solution cache entries below */
+    ret = posix_memalign((void**)&md->coeffs_eval, 32,
+                         2 * basis_order_r * basis_order_z * sizeof(*md->coeffs_eval));
+    if (ret) {
+        md->coeffs_eval = NULL;
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++) {
+        ret = posix_memalign((void**)&md->solution_cache[i].coeffs, 32,
+                             2 * basis_order_r * basis_order_z * sizeof(*md->solution_cache[i].coeffs));
+        if (ret) {
+            md->solution_cache[i].coeffs = NULL;
+            ret = -ENOMEM;
+            goto fail;
+        }
+    }
+
+    md_context = md;
+
+    return 0;
+
+fail:
+    /* NOTE(review): the thread pool and the solver are not released here,
+     * since their cleanup functions are not visible from this file section */
+    for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++)
+        free(md->solution_cache[i].coeffs);
+    free(md->coeffs_eval);
+    free(md);
+    return ret;
+}
+
+/**
+ * Scheduled function: solve the minimal distortion equations on the current
+ * time level and store the resulting coefficients as the newest entry of the
+ * solution cache.
+ */
+void minimal_distortion_solve(CCTK_ARGUMENTS)
+{
+    MDContext *md;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double time;
+    double *tmp;
+
+    /* without a context there is nothing to fall back to */
+    if (!md_context && context_init(cctkGH) < 0)
+        CCTK_WARN(0, "Error initializing the minimal distortion solver");
+
+    md = md_context;
+
+    time = cctkGH->cctk_time / md->gh->cctk_delta_time;
+
+    //if (md->gh->cctk_levfac[0] != 1 || fabs(time - ceilf(time)) > 1e-8 ||
+    //    (md->nb_solutions && md->solution_cache[md->nb_solutions - 1].time == cctkGH->cctk_time))
+    //    return;
+    //if (md->gh->cctk_time < 10.0)
+    //    return;
+
+    CCTK_TimerStart("MinimalDistortion_Solve");
+    md_solver_solve(md->solver);
+    CCTK_TimerStop("MinimalDistortion_Solve");
+
+    fprintf(stderr, "%d md solve: time %g %g %g\n", md->gh->cctk_levfac[0], md->gh->cctk_time, time, md->solver->coeffs[0]);
+
+    /* Swap buffers with the cache instead of copying: the solver's buffer
+     * becomes the newest cache entry and the solver continues with the buffer
+     * of the entry that was dropped (or of the not yet used slot). */
+    if (md->nb_solutions == ARRAY_ELEMS(md->solution_cache)) {
+        tmp = md->solution_cache[0].coeffs;
+        memmove(md->solution_cache, md->solution_cache + 1, sizeof(md->solution_cache[0]) * (ARRAY_ELEMS(md->solution_cache) - 1));
+    } else {
+        md->nb_solutions++;
+        tmp = md->solution_cache[md->nb_solutions - 1].coeffs;
+    }
+    md->solution_cache[md->nb_solutions - 1].coeffs = md->solver->coeffs;
+    md->solution_cache[md->nb_solutions - 1].time   = md->gh->cctk_time;
+
+    md->solver->coeffs = tmp;
+}
+
+/* Vectorized counterpart of md_scalarproduct_metric_c() below; presumably the
+ * AVX instantiation of scalarproduct_metric from expansion.asm. */
+double md_scalarproduct_metric_avx(size_t len1, size_t len2, const double *mat,
+ const double *vec1, const double *vec2);
+
+/**
+ * Plain C evaluation of the bilinear form vec2^T * mat * vec1.
+ *
+ * @param len1 number of elements in vec1, which is also the row length of mat
+ * @param len2 number of elements in vec2, which is also the number of rows of mat
+ * @param mat  len2 rows of len1 elements each, stored contiguously
+ * @param vec1 vector contracted with the columns of mat
+ * @param vec2 vector contracted with the rows of mat
+ * @return the sum over l, m of mat[l * len1 + m] * vec1[m] * vec2[l];
+ *         0.0 if either length is 0
+ */
+static double md_scalarproduct_metric_c(size_t len1, size_t len2, const double *mat,
+                                        const double *vec1, const double *vec2)
+{
+    double val = 0.0;
+
+    /* size_t indices, so that they match the type of the lengths */
+    for (size_t l = 0; l < len2; l++) {
+        const double *row = mat + l * len1;
+        double tmp = 0.0;
+
+        for (size_t m = 0; m < len1; m++)
+            tmp += row[m] * vec1[m];
+
+        val += tmp * vec2[l];
+    }
+    return val;
+}
+
+/* Disabled thread-pool job: evaluates the expansion point by point on the
+ * slab [z_idx_start, z_idx_end) x [x_idx_start, x_idx_end) of the y == 0
+ * plane and writes the result to W. It still refers to an older layout of
+ * the solver (basis[n]->eval) and is kept for reference only. */
+#if 0
+static void md_eval(void *arg,
+ unsigned int job_id, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads)
+{
+ EvalContext *e = (EvalContext*)arg + job_id;
+ CoordPatch *cp = e->cp;
+ MDContext *md = e->md;
+ const cGH *gh = e->md->gh;
+ double *W = e->W;
+
+ for (int k = e->z_idx_start; k < e->z_idx_end; k++) {
+ for (int i = e->x_idx_start; i < e->x_idx_end; i++) {
+ int idx = CCTK_GFINDEX3D(gh, i, cp->y_idx, k);
+ double xx = e->x[idx];
+ double zz = e->z[idx];
+ double r = sqrt(SQR(xx) + SQR(zz));
+ double phi = atan2(zz, xx);
+
+ double *basis_vec1 = e->eval_tmp[0];
+ double *basis_vec2 = e->eval_tmp[1];
+
+ for (int l = 0; l < e->nb_coeffs[0]; l++)
+ basis_vec1[l] = md->solver->basis[0]->eval(r, l);
+ /* NOTE(review): basis_vec2 is later used with length nb_coeffs[1],
+ * yet this loop is bounded by nb_coeffs[0] -- confirm before
+ * re-enabling */
+ for (int l = 0; l < e->nb_coeffs[0]; l++)
+ basis_vec2[l] = md->solver->basis[1]->eval(phi, l);
+
+ W[idx] = md_scalarproduct_metric_avx(e->nb_coeffs[0], e->nb_coeffs[1], e->coeffs,
+ basis_vec1, basis_vec2);
+ }
+ }
+}
+#endif
+
+/**
+ * Scheduled function: expand the most recent solution on the y == 0 plane of
+ * the current grid and write it into ML_BSSN::beta1 and ML_BSSN::beta3.
+ *
+ * If no solution has been computed yet, a warning is issued and the shift is
+ * left untouched. Every 256 calls timing statistics are printed to stderr.
+ */
+void minimal_distortion_eval(CCTK_ARGUMENTS)
+{
+    MDContext *md;
+
+    CoordPatch *cp;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double *beta1 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta1");
+    double *beta3 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta3");
+
+    /* only read by the disabled coefficient-selection branches below */
+    double time;
+
+    int64_t expand_start;
+
+    double *coeffs = NULL;
+
+    /* without a context there is nothing to fall back to */
+    if (!md_context && context_init(cctkGH) < 0)
+        CCTK_WARN(0, "Error initializing the minimal distortion solver");
+
+    time = cctkGH->cctk_time;
+
+    md = md_context;
+
+    cp = get_coord_patch(md, x, y, z, scale_factor, scale_power);
+
+#if 1
+    /* the solution cache is empty until the first solve; indexing it with
+     * nb_solutions - 1 would then read before the start of the array */
+    if (md->nb_solutions < 1) {
+        CCTK_WARN(1, "No minimal distortion solution available yet, not setting the shift");
+        return;
+    }
+    //coeffs = md->coeffs;
+    coeffs = md->solution_cache[md->nb_solutions - 1].coeffs;
+#elif 0
+    if (time < 10.0) {
+        return;
+    } else if (time < 11.0) {
+        double fact = exp(-36.0 * pow((10.0 - time), 4.0));
+        double *coeffs_src = md->solution_cache[md->nb_solutions - 1].coeffs;
+
+        coeffs = md->coeffs_eval;
+        for (int i = 0; i < md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1] * 2; i++)
+            coeffs[i] = coeffs_src[i] * fact;
+    } else
+        coeffs = md->solution_cache[md->nb_solutions - 1].coeffs;
+
+#else
+    coeffs = md->coeffs_eval;
+
+    if (cctkGH->cctk_levfac[0] < 1 || md->nb_solutions < 2) {
+        memset(coeffs, 0, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+        //fprintf(stderr, "md eval: time %g zero\n", md->gh->cctk_time);
+    } else {
+        double *coeffs0 = md->solution_cache[md->nb_solutions - 2].coeffs;
+        double *coeffs1 = md->solution_cache[md->nb_solutions - 1].coeffs;
+        double time0 = md->solution_cache[md->nb_solutions - 2].time;
+        double time1 = md->solution_cache[md->nb_solutions - 1].time;
+
+        double fact = 1.0;
+
+        //if (time < 9.0)
+        //    fact = 1.0;
+        //else
+        //    fact = exp(-36.0 * pow((time - 9.0), 4.0));
+        //else if (time < 0.1)
+        //    fact = 0.0;
+        //else
+        //    fact = (1.0 - exp(-pow((time - 0.0) / 0.25, 4.0)));
+        //fact = 1.0;
+
+        //fprintf(stderr, "md eval: time %g interp from %g %g %g\n", md->gh->cctk_time, time0, time1, fact);
+
+        for (int i = 0; i < 2 * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]; i++)
+            coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact;
+
+    }
+#endif
+
+    /* the first half of coeffs belongs to beta^x, the second half to beta^z */
+    if (export_coeffs) {
+        memcpy(betax_coeffs, coeffs, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+        memcpy(betaz_coeffs, coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1],
+               sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]);
+    }
+
+    CCTK_TimerStart("MinimalDistortion_Expand");
+    expand_start = gettime();
+#if 0
+#pragma omp parallel for
+    for (int k = 0; k < cctk_lsh[2]; k++) {
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k);
+            double xx = x[idx];
+            double zz = z[idx];
+            double r = sqrt(SQR(xx) + SQR(zz));
+            double phi = atan2(zz, xx);
+
+            double val = 0.0;
+
+            for (int l = 0; l < md->nb_coeffs_z; l++) {
+                double tmp = 0.0;
+                for (int m = 0; m < md->nb_coeffs_x; m++) {
+                    const int idx_coeff = l * md->nb_coeffs_x + m;
+                    tmp += coeffs[idx_coeff] * md->basis->eval(r, m);
+                }
+                val += tmp * md->basis1->eval(phi, l);
+            }
+
+            W[idx] = val;
+        }
+    }
+#elif 0
+    {
+        for (int i = 0; i < cp->nb_threads; i++) {
+            cp->ec[i].cp     = cp;
+            cp->ec[i].x      = x;
+            cp->ec[i].z      = z;
+            cp->ec[i].W      = W;
+            cp->ec[i].coeffs = coeffs;
+        }
+        md_threadpool_execute(cp->tp, cp->nb_threads, md_eval, cp->ec);
+    }
+#elif MD_POLAR || 1
+    /* beta^x: first contract the coefficients with the basis values along x
+     * for all grid points at once, then with the basis values along z point
+     * by point */
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0],
+                1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2],
+                coeffs, md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * md->solver->nb_coeffs[1], 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            beta1[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
+        }
+    /* beta^z: the same with the second half of the coefficients */
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0],
+                1.0, cp->transform_matrix2, cctk_lsh[0] * cctk_lsh[2],
+                coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1],
+                md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix3 + idx_grid * md->solver->nb_coeffs[1], 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            beta3[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
+        }
+#else
+    memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W));
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                md->solver->nb_coeffs[0], cctk_lsh[2], md->solver->nb_coeffs[1], 1.0,
+                coeffs, md->solver->nb_coeffs[0], cp->basis_val_z, md->solver->nb_coeffs[1],
+                0.0, cp->transform_z, md->solver->nb_coeffs[0]);
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], md->solver->nb_coeffs[0], 1.0,
+                cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, md->solver->nb_coeffs[0],
+                1.0, W, cctk_lsh[0] * cctk_lsh[1]);
+#endif
+
+    md->grid_expand_time += gettime() - expand_start;
+    md->grid_expand_count++;
+
+    CCTK_TimerStop("MinimalDistortion_Expand");
+
+    /* print stats */
+    if (!(md->grid_expand_count & 255)) {
+        fprintf(stderr, "Minimal distortion stats:\n");
+
+        md_solver_print_stats(md->solver);
+
+        /* the counter is a uint64_t, so it is printed with PRIu64; the
+         * average is the accumulated time divided by 1e3, i.e. milliseconds
+         * when the total divided by 1e6 is seconds */
+        fprintf(stderr,
+                "%" PRIu64 " evals: total time %g s, avg time per call %g ms\n",
+                md->grid_expand_count, (double)md->grid_expand_time / 1e6,
+                (double)md->grid_expand_time / md->grid_expand_count / 1e3);
+    }
+}
+
+/**
+ * Scheduled function: create the global thorn context if it does not exist
+ * yet. A failure to do so is fatal.
+ */
+void minimal_distortion_init(CCTK_ARGUMENTS)
+{
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    if (!md_context && context_init(cctkGH) < 0)
+        CCTK_WARN(0, "Error initializing the minimal distortion solver");
+}
diff --git a/src/md.h b/src/md.h
new file mode 100644
index 0000000..0a4a917
--- /dev/null
+++ b/src/md.h
@@ -0,0 +1,19 @@
+#ifndef MD_MD_H
+#define MD_MD_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#endif
+
+#include <inttypes.h>
+
+#include "cctk.h"
+
+#include "md_solve.h"
+#include "threadpool.h"
+
+typedef struct MDContext MDContext; /* forward declaration only; the struct body is not defined in this header */
+
+#endif /* MD_MD_H */
diff --git a/src/md_solve.c b/src/md_solve.c
new file mode 100644
index 0000000..c7fa329
--- /dev/null
+++ b/src/md_solve.c
@@ -0,0 +1,818 @@
+/*
+ * Minimal distortion -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "cctk.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "basis.h"
+#include "pssolve.h"
+#include "md_solve.h"
+#include "threadpool.h"
+
+/* number of spectral coefficients of one equation (product over both dimensions);
+ * the argument is parenthesised so that any pointer expression can be passed */
+#define NB_COEFFS(md) ((md)->nb_coeffs[0] * (md)->nb_coeffs[1])
+/* number of collocation points of one equation (product over both dimensions) */
+#define NB_COLLOC_POINTS(md) ((md)->nb_colloc_points[0] * (md)->nb_colloc_points[1])
+
+/* indices (in our code, not cactus structs) of the grid functions which we'll need to
+ * interpolate on the pseudospectral grid; these values index metric_vars[] and
+ * MDSolverPriv.interp_vars_indices[] */
+enum MetricVars {
+ GTXX = 0, /* GT**: the ML_BSSN::gt11..gt23 variables (see metric_vars[]) */
+ GTYY,
+ GTZZ,
+ GTXY,
+ GTXZ,
+ GTYZ,
+ PHI, /* ML_BSSN::phi */
+ ATXX, /* AT**: the ML_BSSN::At11..At23 variables */
+ ATYY,
+ ATZZ,
+ ATXY,
+ ATXZ,
+ ATYZ,
+ XTX, /* XT*: ML_BSSN::Xt1..Xt3 */
+ XTY,
+ XTZ,
+ ALPHA, /* ML_BSSN::alpha */
+ TRK, /* ML_BSSN::trK */
+ NB_METRIC_VARS, /* number of entries above; used as an array size */
+};
+
+/* indices of the interpolated values of the above grid functions and their derivatives;
+ * these values index interp_operation_indices[], interp_operation_codes[] and the
+ * interp_values[] arrays. The suffixes _DX/_DY/_DZ and _DXX/_DXZ/_DZZ denote the
+ * derivative requested from the interpolator (see interp_operation_codes[]) */
+enum InterpMetricVars {
+ I_GTXX = 0, /* plain values of the six gt components */
+ I_GTYY,
+ I_GTZZ,
+ I_GTXY,
+ I_GTXZ,
+ I_GTYZ,
+ I_GTXX_DX, /* first derivatives; only the xx, yy, zz and xz components are requested */
+ I_GTYY_DX,
+ I_GTZZ_DX,
+ I_GTXZ_DX,
+ I_GTXX_DZ,
+ I_GTYY_DZ,
+ I_GTZZ_DZ,
+ I_GTXZ_DZ,
+ I_GTXX_DXX, /* second derivatives of the same four components */
+ I_GTYY_DXX,
+ I_GTZZ_DXX,
+ I_GTXZ_DXX,
+ I_GTXX_DXZ,
+ I_GTYY_DXZ,
+ I_GTZZ_DXZ,
+ I_GTXZ_DXZ,
+ I_GTXX_DZZ,
+ I_GTYY_DZZ,
+ I_GTZZ_DZZ,
+ I_GTXZ_DZZ,
+ I_PHI, /* phi, its first and second derivatives */
+ I_PHI_DX,
+ I_PHI_DY,
+ I_PHI_DZ,
+ I_PHI_DXX,
+ I_PHI_DZZ,
+ I_PHI_DXZ,
+ I_ATXX, /* plain values of the six At components */
+ I_ATYY,
+ I_ATZZ,
+ I_ATXY,
+ I_ATXZ,
+ I_ATYZ,
+ I_ATXX_DX, /* first derivatives of the xx, yy, zz and xz components of At */
+ I_ATYY_DX,
+ I_ATZZ_DX,
+ I_ATXZ_DX,
+ I_ATXX_DZ,
+ I_ATYY_DZ,
+ I_ATZZ_DZ,
+ I_ATXZ_DZ,
+ I_XTX, /* Xt components, values only */
+ I_XTY,
+ I_XTZ,
+ I_ALPHA, /* alpha and its first derivatives */
+ I_ALPHA_DX,
+ I_ALPHA_DY,
+ I_ALPHA_DZ,
+ I_TRK, /* trK and its x/z derivatives */
+ I_TRK_DX,
+ I_TRK_DZ,
+ NB_INTERP_VARS, /* number of entries above; used as an array size */
+};
+
+/* per-equation state; the buffers are allocated in eq_init() and released in
+ * md_solver_free() */
+typedef struct MDEquationContext {
+ double *interp_coords[3]; /* x/y/z coordinates of the interpolation points, NB_COLLOC_POINTS entries each; y is set to 0 in eq_init() */
+ double *interp_values[NB_INTERP_VARS]; /* interpolator output, one array per enum InterpMetricVars entry */
+
+ /* eq_coeffs[i][j] is an array of coefficients at the collocation points
+ * for j-th derivative of i-th unknown function */
+ double *(*eq_coeffs)[PSSOLVE_DIFF_ORDER_NB];
+
+ double *rhs; /* points into MDSolverPriv.rhs at this equation's offset (set in eq_init()) */
+} MDEquationContext;
+
+struct MDSolverPriv { /* private solver state behind MDSolver.priv */
+ PSSolveContext *ps_ctx; /* pseudospectral solver, created in md_solver_init() */
+ cGH *gh; /* grid hierarchy handed to md_solver_init(); used for interpolation */
+
+ MDEquationContext *eqs; /* array of nb_equations per-equation contexts */
+
+ int colloc_grid_order[2]; /* copied from MDSolver.nb_colloc_points in md_solver_init() */
+
+ double *rhs; /* nb_equations * NB_COLLOC_POINTS values; each equation's rhs points into it */
+
+ double *coeff_scale; /* exponential filter factors computed in md_solver_init(); their application in md_solver_solve() is commented out */
+
+ // interpolation parameters
+ int coord_system; /* handle from CCTK_CoordSystemHandle("cart3d") */
+ int interp_operator; /* handle from CCTK_InterpHandle() */
+ int interp_params; /* parameter table handle holding operation_codes / operand_indices */
+
+ CCTK_REAL *interp_coords[3]; /* NOTE(review): never allocated in the visible code, only passed to free() */
+
+ int interp_vars_indices[NB_METRIC_VARS]; /* Cactus variable indices of metric_vars[] */
+ CCTK_REAL *interp_values[NB_INTERP_VARS]; /* NOTE(review): allocation is under #if 0 in md_solver_init(); only passed to free() */
+ CCTK_INT interp_value_codes[NB_INTERP_VARS]; /* output type codes, all CCTK_VARIABLE_REAL */
+
+#if HAVE_OPENCL
+ // OpenCL / CLBLAS stuff
+ cl_context ocl_ctx; /* stays zero when init_opencl() fails */
+ cl_command_queue ocl_queue;
+#endif
+
+ ThreadPoolContext *tp; /* pool actually used: the caller's pool or tp_internal */
+ ThreadPoolContext *tp_internal; /* pool created by md_solver_init() when the caller passes none; freed in md_solver_free() */
+
+ uint64_t solve_count; /* number of completed md_solver_solve() calls */
+ uint64_t solve_time; /* accumulated gettime() differences; printed as seconds after / 1e6, so presumably microseconds */
+
+ uint64_t interp_geometry_count;
+ uint64_t interp_geometry_time;
+
+ uint64_t calc_eq_coeffs_count;
+ uint64_t calc_eq_coeffs_time;
+};
+
+typedef struct MDCalcEqThread { /* argument handed to the calc_eq_coeffs_* thread pool jobs */
+ MDSolver *ctx; /* the solver the job belongs to */
+ MDEquationContext *eq_ctx; /* the equation whose coefficients are computed */
+ size_t block_size; /* number of collocation points processed by one job */
+} MDCalcEqThread;
+
+/* mapping between our indices and thorn names; indexed by enum MetricVars and
+ * resolved to Cactus variable indices with CCTK_VarIndex() in md_solver_init() */
+static const char *metric_vars[] = {
+ [GTXX] = "ML_BSSN::gt11",
+ [GTYY] = "ML_BSSN::gt22",
+ [GTZZ] = "ML_BSSN::gt33",
+ [GTXY] = "ML_BSSN::gt12",
+ [GTXZ] = "ML_BSSN::gt13",
+ [GTYZ] = "ML_BSSN::gt23",
+ [ATXX] = "ML_BSSN::At11",
+ [ATYY] = "ML_BSSN::At22",
+ [ATZZ] = "ML_BSSN::At33",
+ [ATXY] = "ML_BSSN::At12",
+ [ATXZ] = "ML_BSSN::At13",
+ [ATYZ] = "ML_BSSN::At23",
+ [PHI] = "ML_BSSN::phi",
+ [XTX] = "ML_BSSN::Xt1",
+ [XTY] = "ML_BSSN::Xt2",
+ [XTZ] = "ML_BSSN::Xt3",
+ [ALPHA] = "ML_BSSN::alpha",
+ [TRK] = "ML_BSSN::trK",
+};
+
+/* mapping between the cactus grid values and interpolated values: for every
+ * enum InterpMetricVars entry, the enum MetricVars input it is computed from.
+ * Stored in the interpolator parameter table under "operand_indices" */
+static const CCTK_INT interp_operation_indices[] = {
+ [I_GTXX] = GTXX,
+ [I_GTYY] = GTYY,
+ [I_GTZZ] = GTZZ,
+ [I_GTXY] = GTXY,
+ [I_GTXZ] = GTXZ,
+ [I_GTYZ] = GTYZ,
+ [I_GTXX_DX] = GTXX,
+ [I_GTYY_DX] = GTYY,
+ [I_GTZZ_DX] = GTZZ,
+ [I_GTXZ_DX] = GTXZ,
+ [I_GTXX_DZ] = GTXX,
+ [I_GTYY_DZ] = GTYY,
+ [I_GTZZ_DZ] = GTZZ,
+ [I_GTXZ_DZ] = GTXZ,
+ [I_GTXX_DXX] = GTXX,
+ [I_GTYY_DXX] = GTYY,
+ [I_GTZZ_DXX] = GTZZ,
+ [I_GTXZ_DXX] = GTXZ,
+ [I_GTXX_DXZ] = GTXX,
+ [I_GTYY_DXZ] = GTYY,
+ [I_GTZZ_DXZ] = GTZZ,
+ [I_GTXZ_DXZ] = GTXZ,
+ [I_GTXX_DZZ] = GTXX,
+ [I_GTYY_DZZ] = GTYY,
+ [I_GTZZ_DZZ] = GTZZ,
+ [I_GTXZ_DZZ] = GTXZ,
+ [I_PHI] = PHI,
+ [I_PHI_DX] = PHI,
+ [I_PHI_DY] = PHI,
+ [I_PHI_DZ] = PHI,
+ [I_PHI_DXX] = PHI,
+ [I_PHI_DZZ] = PHI,
+ [I_PHI_DXZ] = PHI,
+ [I_ATXX] = ATXX,
+ [I_ATYY] = ATYY,
+ [I_ATZZ] = ATZZ,
+ [I_ATXY] = ATXY,
+ [I_ATXZ] = ATXZ,
+ [I_ATYZ] = ATYZ,
+ [I_ATXX_DX] = ATXX,
+ [I_ATYY_DX] = ATYY,
+ [I_ATZZ_DX] = ATZZ,
+ [I_ATXZ_DX] = ATXZ,
+ [I_ATXX_DZ] = ATXX,
+ [I_ATYY_DZ] = ATYY,
+ [I_ATZZ_DZ] = ATZZ,
+ [I_ATXZ_DZ] = ATXZ,
+ [I_XTX] = XTX,
+ [I_XTY] = XTY,
+ [I_XTZ] = XTZ,
+ [I_ALPHA] = ALPHA,
+ [I_ALPHA_DX] = ALPHA,
+ [I_ALPHA_DY] = ALPHA,
+ [I_ALPHA_DZ] = ALPHA,
+ [I_TRK] = TRK,
+ [I_TRK_DX] = TRK,
+ [I_TRK_DZ] = TRK,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during interpolation:
+ * 0 is used for plain values, 1/2/3 for the entries named _DX/_DY/_DZ and
+ * 11/13/33 for the entries named _DXX/_DXZ/_DZZ. Stored in the interpolator
+ * parameter table under "operation_codes" */
+static const CCTK_INT interp_operation_codes[] = {
+ [I_GTXX] = 0,
+ [I_GTYY] = 0,
+ [I_GTZZ] = 0,
+ [I_GTXY] = 0,
+ [I_GTXZ] = 0,
+ [I_GTYZ] = 0,
+ [I_GTXX_DX] = 1,
+ [I_GTYY_DX] = 1,
+ [I_GTZZ_DX] = 1,
+ [I_GTXZ_DX] = 1,
+ [I_GTXX_DZ] = 3,
+ [I_GTYY_DZ] = 3,
+ [I_GTZZ_DZ] = 3,
+ [I_GTXZ_DZ] = 3,
+ [I_GTXX_DXX] = 11,
+ [I_GTYY_DXX] = 11,
+ [I_GTZZ_DXX] = 11,
+ [I_GTXZ_DXX] = 11,
+ [I_GTXX_DXZ] = 13,
+ [I_GTYY_DXZ] = 13,
+ [I_GTZZ_DXZ] = 13,
+ [I_GTXZ_DXZ] = 13,
+ [I_GTXX_DZZ] = 33,
+ [I_GTYY_DZZ] = 33,
+ [I_GTZZ_DZZ] = 33,
+ [I_GTXZ_DZZ] = 33,
+ [I_PHI] = 0,
+ [I_PHI_DX] = 1,
+ [I_PHI_DY] = 2,
+ [I_PHI_DZ] = 3,
+ [I_PHI_DXX] = 11,
+ [I_PHI_DZZ] = 33,
+ [I_PHI_DXZ] = 13,
+ [I_ATXX] = 0,
+ [I_ATYY] = 0,
+ [I_ATZZ] = 0,
+ [I_ATXY] = 0,
+ [I_ATXZ] = 0,
+ [I_ATYZ] = 0,
+ [I_ATXX_DX] = 1,
+ [I_ATYY_DX] = 1,
+ [I_ATZZ_DX] = 1,
+ [I_ATXZ_DX] = 1,
+ [I_ATXX_DZ] = 3,
+ [I_ATYY_DZ] = 3,
+ [I_ATZZ_DZ] = 3,
+ [I_ATXZ_DZ] = 3,
+ [I_XTX] = 0,
+ [I_XTY] = 0,
+ [I_XTZ] = 0,
+ [I_ALPHA] = 0,
+ [I_ALPHA_DX] = 1,
+ [I_ALPHA_DY] = 2,
+ [I_ALPHA_DZ] = 3,
+ [I_TRK] = 0,
+ [I_TRK_DX] = 1,
+ [I_TRK_DZ] = 3,
+};
+
+/*
+ * Interpolate the Cactus grid functions (values and derivatives) onto the
+ * collocation points of every equation, filling each equation's
+ * interp_values[] arrays.
+ *
+ * Always returns 0; an interpolator failure is reported through
+ * CCTK_WARN() at level 0.
+ */
+static int interp_geometry(MDSolver *ctx)
+{
+    MDSolverPriv *priv = ctx->priv;
+
+    for (unsigned int eq = 0; eq < ctx->nb_equations; eq++) {
+        MDEquationContext *cur = priv->eqs + eq;
+        const int status =
+            CCTK_InterpGridArrays(priv->gh, 3,
+                                  priv->interp_operator, priv->interp_params,
+                                  priv->coord_system,
+                                  NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL,
+                                  (const void * const *)cur->interp_coords,
+                                  ARRAY_ELEMS(priv->interp_vars_indices),
+                                  priv->interp_vars_indices,
+                                  ARRAY_ELEMS(cur->interp_values),
+                                  priv->interp_value_codes,
+                                  (void * const *)cur->interp_values);
+
+        if (status < 0)
+            CCTK_WARN(0, "Error interpolating");
+    }
+
+    return 0;
+}
+
+#if 0 /* disabled branch: equations from md_solve_template.c */
+#define EQUATION 0
+#include "md_solve_template.c"
+#undef EQUATION
+
+#define EQUATION 1
+#include "md_solve_template.c"
+#undef EQUATION
+#else /* active branch: the template is included once per equation index; presumably each inclusion defines calc_eq_coeffs_<EQUATION> */
+#define EQUATION 0
+#include "gamma_freeze_template.c"
+#undef EQUATION
+
+#define EQUATION 1
+#include "gamma_freeze_template.c"
+#undef EQUATION
+#endif
+
+static void (*calc_eq_coeffs[2])(void *, unsigned int, unsigned int, /* thread pool job callbacks, indexed by equation number */
+ unsigned int, unsigned int) = {
+ calc_eq_coeffs_0,
+ calc_eq_coeffs_1,
+};
+
+/*
+ * Run one complete solve:
+ *   1. interpolate the grid functions onto the collocation points,
+ *   2. evaluate the equation coefficients on the thread pool,
+ *   3. hand the coefficients and the right-hand side to the pseudospectral
+ *      solver, which writes the result into ctx->coeffs.
+ *
+ * The time spent in each stage is accumulated in the private counters that
+ * md_solver_print_stats() reports.
+ *
+ * Returns 0 on success or the negative error code of the failing stage.
+ */
+int md_solver_solve(MDSolver *ctx)
+{
+    MDSolverPriv *priv = ctx->priv;
+    const double *(*coeff_tables[2])[PSSOLVE_DIFF_ORDER_NB];
+    int64_t t_total, t_stage;
+    int ret;
+
+    t_total = gettime();
+
+    /* stage 1: interpolate the metric values and construct the quantities we'll need */
+    CCTK_TimerStart("MinimalDistortion_interp_geometry");
+    t_stage = gettime();
+
+    ret = interp_geometry(ctx);
+
+    priv->interp_geometry_time += gettime() - t_stage;
+    priv->interp_geometry_count++;
+    CCTK_TimerStop("MinimalDistortion_interp_geometry");
+    if (ret < 0)
+        return ret;
+
+    /* stage 2: equation coefficients, split into blocks of collocation points */
+    CCTK_TimerStart("MinimalDistortion_calc_eq_coeffs");
+    t_stage = gettime();
+
+    for (unsigned int eq = 0; eq < ctx->nb_equations; eq++) {
+        MDCalcEqThread job;
+        size_t nb_jobs;
+
+        job.ctx        = ctx;
+        job.eq_ctx     = &priv->eqs[eq];
+        job.block_size = 256;
+
+        /* round up so that the last, partial block is processed as well */
+        nb_jobs = (NB_COLLOC_POINTS(ctx) + job.block_size - 1) / job.block_size;
+
+        md_threadpool_execute(priv->tp, nb_jobs, calc_eq_coeffs[eq], &job);
+    }
+
+    coeff_tables[0] = priv->eqs[0].eq_coeffs;
+    coeff_tables[1] = priv->eqs[1].eq_coeffs;
+
+    priv->calc_eq_coeffs_time += gettime() - t_stage;
+    priv->calc_eq_coeffs_count++;
+    CCTK_TimerStop("MinimalDistortion_calc_eq_coeffs");
+    /* ret still holds the result of interp_geometry() at this point */
+    if (ret < 0)
+        return ret;
+
+    /* stage 3: the actual pseudospectral solve */
+    ret = md_pssolve_solve(priv->ps_ctx, coeff_tables, priv->rhs, ctx->coeffs);
+    if (ret < 0)
+        return ret;
+
+    /* filtering of the solution with priv->coeff_scale is currently disabled:
+     * for (int i = 0; i < ctx->nb_equations * NB_COEFFS(ctx); i++)
+     *     ctx->coeffs[i] *= s->coeff_scale[i];
+     */
+
+    priv->solve_count++;
+    priv->solve_time += gettime() - t_total;
+
+    return 0;
+}
+
+/*
+ * Print the accumulated timing statistics of the solver to stderr: for each
+ * stage its share of the total solve time, the number of calls, the total
+ * time and the average time per call.
+ *
+ * The counters are divided by 1e6 to print seconds and by 1e3 to print
+ * milliseconds. All counters are cast to uint64_t and printed with PRIu64,
+ * so the format matches the argument on every platform (uint64_t is not
+ * necessarily unsigned long).
+ *
+ * Averages are computed in floating point, so a zero call count prints
+ * inf/nan rather than trapping.
+ */
+void md_solver_print_stats(MDSolver *ctx)
+{
+    MDSolverPriv *s = ctx->priv;
+
+    fprintf(stderr,
+            "%g%% interpolate geometry: %" PRIu64 ", "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->interp_geometry_time * 100 / s->solve_time,
+            (uint64_t)s->interp_geometry_count, (double)s->interp_geometry_time / 1e6,
+            (double)s->interp_geometry_time / s->interp_geometry_count / 1e3);
+    fprintf(stderr,
+            "%g%% calc equation coefficients: %" PRIu64 ", "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->calc_eq_coeffs_time * 100 / s->solve_time,
+            (uint64_t)s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6,
+            (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3);
+    fprintf(stderr,
+            "%g%% pseudospectral matrix construction: %" PRIu64 ", "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time,
+            (uint64_t)s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6,
+            (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3);
+    fprintf(stderr,
+            "%g%% BiCGSTAB %" PRIu64 " solves, "
+            "%" PRIu64 " iterations, total time %g s, "
+            "avg iterations per solve %g, avg time per solve %g ms, "
+            "avg time per iteration %g ms\n",
+            (double)s->ps_ctx->cg_time_total * 100 / s->solve_time,
+            (uint64_t)s->ps_ctx->cg_solve_count, (uint64_t)s->ps_ctx->cg_iter_count,
+            (double)s->ps_ctx->cg_time_total / 1e6,
+            (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3);
+    fprintf(stderr,
+            "%g%% LU %" PRIu64 " solves, total time %g s, avg time per solve %g ms\n",
+            (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time,
+            (uint64_t)s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6,
+            (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3);
+}
+
+/*
+ * Best-effort OpenCL / clBLAS setup (compiled to an empty function when
+ * HAVE_OPENCL is not set).
+ *
+ * Picks the first platform and its first GPU device, creates a context and a
+ * command queue and initialises clBLAS. On any failure a message is printed
+ * to stderr and the function returns without an OpenCL queue; a failure
+ * after the context was created releases it again and zeroes both handles.
+ *
+ * The status and count variables use the OpenCL types (cl_int / cl_uint)
+ * that the API declares for them.
+ */
+static void init_opencl(MDSolver *ctx)
+#if HAVE_OPENCL
+{
+    MDSolverPriv *s = ctx->priv;
+    cl_int err;
+    cl_uint count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+    cl_device_id ocl_device;
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    /* zero-terminated property list selecting the platform */
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !s->ocl_ctx) {
+        fprintf(stderr, "Could not create an OpenCL context\n");
+        return;
+    }
+
+    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !s->ocl_queue) {
+        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    return;
+fail:
+    /* undo the partial setup so that the handles read as "no OpenCL" */
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    s->ocl_queue = 0;
+
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+    s->ocl_ctx = 0;
+}
+#else
+{
+}
+#endif
+
+/*
+ * Set up the per-equation state of equation eq_idx: the interpolation
+ * coordinates (the collocation grid of that equation, laid out in the y = 0
+ * plane), the buffers receiving the interpolated values, the equation
+ * coefficient arrays and the pointer to this equation's part of the shared
+ * right-hand side.
+ *
+ * Returns 0 on success and -ENOMEM when an allocation fails; whatever was
+ * allocated up to that point is left in place for md_solver_free().
+ */
+static int eq_init(MDSolver *ctx, unsigned int eq_idx)
+{
+    MDSolverPriv *priv = ctx->priv;
+    MDEquationContext *eq = priv->eqs + eq_idx;
+    const size_t nb_points = NB_COLLOC_POINTS(ctx);
+    const int nx = ctx->nb_colloc_points[0];
+    const int nz = ctx->nb_colloc_points[1];
+    double *grid_x = priv->ps_ctx->colloc_grid[eq_idx][0];
+    double *grid_z = priv->ps_ctx->colloc_grid[eq_idx][1];
+
+    /* prepare the state for the cactus interpolator */
+    for (int dim = 0; dim < ARRAY_ELEMS(eq->interp_coords); dim++) {
+        if (posix_memalign((void**)&eq->interp_coords[dim], 32,
+                           nb_points * sizeof(*eq->interp_coords[dim])))
+            return -ENOMEM;
+    }
+
+    /* tensor-product grid, x index running fastest */
+    for (int iz = 0; iz < nz; iz++) {
+        const int row = iz * nx;
+
+        for (int ix = 0; ix < nx; ix++) {
+            eq->interp_coords[0][row + ix] = grid_x[ix];
+            eq->interp_coords[1][row + ix] = 0;
+            eq->interp_coords[2][row + ix] = grid_z[iz];
+        }
+    }
+
+    for (int var = 0; var < ARRAY_ELEMS(eq->interp_values); var++) {
+        if (posix_memalign((void**)&eq->interp_values[var], 32,
+                           nb_points * sizeof(*eq->interp_values[var])))
+            return -ENOMEM;
+    }
+
+    /* allocate the equation coefficients */
+    eq->eq_coeffs = calloc(ctx->nb_equations, sizeof(*eq->eq_coeffs));
+    if (!eq->eq_coeffs)
+        return -ENOMEM;
+
+    for (int func = 0; func < ctx->nb_equations; func++) {
+        for (int diff = 0; diff < ARRAY_ELEMS(eq->eq_coeffs[func]); diff++) {
+            if (posix_memalign((void**)&eq->eq_coeffs[func][diff], 32,
+                               nb_points * sizeof(*eq->eq_coeffs[func][diff])))
+                return -ENOMEM;
+        }
+    }
+
+    /* setup the RHS pointer: equation 0 starts at the beginning of the shared
+     * buffer, each later one directly after its predecessor */
+    eq->rhs = eq_idx ? priv->eqs[eq_idx - 1].rhs + NB_COLLOC_POINTS(ctx)
+                     : priv->rhs;
+
+    return 0;
+}
+
+static const enum MDBasisFamily basis_sets[2][2] = { /* basis family used for [equation][dimension], see md_solver_init() */
+ { MD_BASIS_FAMILY_SB_ODD, MD_BASIS_FAMILY_SB_EVEN },
+ { MD_BASIS_FAMILY_SB_EVEN, MD_BASIS_FAMILY_SB_ODD },
+};
+
+/**
+ * Allocate and initialise a solver context.
+ *
+ * @param pctx         set to the new solver on success; not written on failure
+ * @param cctkGH       grid hierarchy, stored and later used for interpolation
+ * @param tp           thread pool to run the jobs on; when NULL a private pool
+ *                     is created with md_threadpool_init(..., 1) and owned by
+ *                     the solver
+ * @param nb_equations number of equations; must be 2, since basis_sets[],
+ *                     calc_eq_coeffs[], MDSolver.basis[] and the
+ *                     pseudospectral solver setup are all sized for exactly
+ *                     two equations
+ * @param basis_order  basis_order[i][j] is the solve order of equation i in
+ *                     dimension j; row 0 also fixes nb_coeffs and
+ *                     nb_colloc_points
+ * @param sf           currently unused: the basis scaling factor is derived
+ *                     from the outermost collocation point instead
+ * @param filter_power exponent of the exponential filter stored in coeff_scale
+ * @param input_filter_power currently unused
+ *
+ * @return 0 on success, -EINVAL for an unsupported nb_equations, -ENOMEM for
+ *         any other failure that is reported through the return value.
+ *         Failures of the Cactus-related setup are reported with CCTK_WARN()
+ *         at level 0 instead.
+ */
+int md_solver_init(MDSolver **pctx,
+                   cGH *cctkGH, ThreadPoolContext *tp,
+                   unsigned int nb_equations,
+                   unsigned int (*basis_order)[2],
+                   double sf, double filter_power, double input_filter_power)
+{
+    MDSolver *ctx;
+    MDSolverPriv *s;
+    int max_order = 0;
+    int ret;
+
+    /* reject equation counts the fixed-size tables cannot represent, rather
+     * than indexing out of bounds further down */
+    if (nb_equations != ARRAY_ELEMS(basis_sets))
+        return -EINVAL;
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->priv = calloc(1, sizeof(*ctx->priv));
+    if (!ctx->priv)
+        goto fail;
+    s = ctx->priv;
+
+    s->gh = cctkGH;
+
+    if (tp) {
+        s->tp = tp;
+    } else {
+        ret = md_threadpool_init(&s->tp_internal, 1);
+        if (ret < 0)
+            goto fail;
+        s->tp = s->tp_internal;
+    }
+
+    s->eqs = calloc(nb_equations, sizeof(*s->eqs));
+    if (!s->eqs)
+        goto fail;
+    /* only set once eqs exists, so that md_solver_free() never walks a NULL array */
+    ctx->nb_equations = nb_equations;
+
+    ctx->nb_coeffs[0] = basis_order[0][0];
+    ctx->nb_coeffs[1] = basis_order[0][1];
+
+    ctx->nb_colloc_points[0] = basis_order[0][0];
+    ctx->nb_colloc_points[1] = basis_order[0][1];
+
+    if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx))
+        CCTK_WARN(0, "Non-square collocation matrix");
+
+    s->colloc_grid_order[0] = ctx->nb_colloc_points[0];
+    s->colloc_grid_order[1] = ctx->nb_colloc_points[1];
+
+    ret  = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * nb_equations * NB_COEFFS(ctx));
+    ret |= posix_memalign((void**)&s->rhs,      32, sizeof(*s->rhs)      * nb_equations * NB_COLLOC_POINTS(ctx));
+    if (ret)
+        goto fail;
+
+    for (int i = 0; i < ctx->nb_equations; i++)
+        for (int j = 0; j < 2; j++) {
+            double scale;
+
+            /* first create the basis with unit scaling to find out where its
+             * outermost collocation point lies ... */
+            ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], 1.0);
+            if (ret < 0)
+                goto fail;
+
+            scale = 64.0 / md_basis_colloc_point(ctx->basis[i][j], s->colloc_grid_order[j],
+                                                 ctx->nb_colloc_points[j] - 1);
+            md_basis_free(&ctx->basis[i][j]);
+
+            /* ... then recreate it with the scaling factor computed from that point */
+            ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], scale);
+            if (ret < 0)
+                goto fail;
+        }
+
+    init_opencl(ctx);
+
+    ret = md_pssolve_context_alloc(&s->ps_ctx, 2);
+    if (ret < 0)
+        CCTK_WARN(0, "Error allocating the pseudospectral solver");
+
+    for (int i = 0; i < 2; i++)
+        for (int j = 0; j < 2; j++) {
+            s->ps_ctx->basis[i][j]       = ctx->basis[i][j];
+            s->ps_ctx->solve_order[i][j] = basis_order[i][j];
+            max_order                    = MAX(max_order, basis_order[i][j]);
+        }
+
+    s->ps_ctx->tp = s->tp;
+
+#if HAVE_OPENCL
+    s->ps_ctx->ocl_ctx   = s->ocl_ctx;
+    s->ps_ctx->ocl_queue = s->ocl_queue;
+#endif
+
+    ret = md_pssolve_context_init(s->ps_ctx);
+    if (ret < 0)
+        CCTK_WARN(0, "Error initializing the pseudospectral solver");
+
+    /* dump the collocation grids to stderr, one row per point index */
+    for (int i = 0; i < max_order; i++) {
+        fprintf(stderr, "%d ", i);
+        for (int j = 0; j < 2; j++)
+            for (int k = 0; k < 2; k++) {
+                if (i < s->ps_ctx->solve_order[j][k])
+                    fprintf(stderr, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]);
+                else
+                    fprintf(stderr, " ");
+            }
+        fprintf(stderr, "\n");
+    }
+
+    /* init the per-equation state */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        ret = eq_init(ctx, i);
+        if (ret < 0)
+            goto fail;
+    }
+
+    /* exponential filter factors, the same table for both equations */
+    ret = posix_memalign((void**)&s->coeff_scale, 32, 2 * NB_COEFFS(ctx) * sizeof(*s->coeff_scale));
+    if (ret)
+        goto fail;
+    for (int j = 0; j < ctx->nb_coeffs[1]; j++)
+        for (int i = 0; i < ctx->nb_coeffs[0]; i++) {
+            s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+                                                        exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+            s->coeff_scale[NB_COEFFS(ctx) + j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+                                                                         exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+        }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) {
+#if 0
+        ret = posix_memalign((void**)&s->interp_values[i], 32,
+                             2 * NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i]));
+        if (ret)
+            goto fail;
+#endif
+        s->interp_value_codes[i] = CCTK_VARIABLE_REAL;
+    }
+
+    /* resolve the thorn variable names to Cactus variable indices */
+    for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
+        s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+        if (s->interp_vars_indices[i] < 0)
+            CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
+    }
+
+    s->coord_system = CCTK_CoordSystemHandle("cart3d");
+    if (s->coord_system < 0)
+        CCTK_WARN(0, "Error getting the coordinate system");
+
+    s->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
+    if (s->interp_operator < 0)
+        CCTK_WARN(0, "Error getting the interpolation operator");
+
+    s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
+    if (s->interp_params < 0)
+        CCTK_WARN(0, "Error creating interpolation parameters table");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_codes, "operation_codes");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operation codes");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_indices, "operand_indices");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operand indices");
+
+    CCTK_TimerCreate("MinimalDistortion_Solve");
+    CCTK_TimerCreate("MinimalDistortion_Expand");
+    CCTK_TimerCreate("MinimalDistortion_interp_geometry");
+    CCTK_TimerCreate("MinimalDistortion_calc_eq_coeffs");
+    CCTK_TimerCreate("MinimalDistortion_construct_matrix");
+    CCTK_TimerCreate("MinimalDistortion_solve_LU");
+    CCTK_TimerCreate("MinimalDistortion_solve_BiCGSTAB");
+
+    *pctx = ctx;
+    return 0;
+fail:
+    /* NOTE(review): md_solver_free() does not release ctx->basis[][]; confirm
+     * who owns the basis sets when failing before the pseudospectral solver
+     * has taken them over */
+    md_solver_free(&ctx);
+    return -ENOMEM;
+}
+
+/*
+ * Release a solver created by md_solver_init() (also used on its failure
+ * path, so every member may still be NULL/zero) and reset *pctx to NULL.
+ * Calling it with *pctx == NULL does nothing.
+ */
+void md_solver_free(MDSolver **pctx)
+{
+    MDSolver *solver = *pctx;
+    MDSolverPriv *priv;
+
+    if (!solver)
+        return;
+
+    priv = solver->priv;
+    if (priv) {
+        /* solver-wide buffers */
+        for (int i = 0; i < ARRAY_ELEMS(priv->interp_coords); i++)
+            free(priv->interp_coords[i]);
+        for (int i = 0; i < ARRAY_ELEMS(priv->interp_values); i++)
+            free(priv->interp_values[i]);
+        free(priv->rhs);
+        free(priv->coeff_scale);
+
+        /* per-equation buffers */
+        for (int eq = 0; eq < solver->nb_equations; eq++) {
+            MDEquationContext *cur = priv->eqs + eq;
+
+            for (int i = 0; i < ARRAY_ELEMS(cur->interp_coords); i++)
+                free(cur->interp_coords[i]);
+            for (int i = 0; i < ARRAY_ELEMS(cur->interp_values); i++)
+                free(cur->interp_values[i]);
+
+            if (cur->eq_coeffs) {
+                for (int func = 0; func < solver->nb_equations; func++)
+                    for (int diff = 0; diff < ARRAY_ELEMS(cur->eq_coeffs[func]); diff++)
+                        free(cur->eq_coeffs[func][diff]);
+            }
+            free(cur->eq_coeffs);
+        }
+        free(priv->eqs);
+
+        md_pssolve_context_free(&priv->ps_ctx);
+
+        /* only the internally created pool belongs to the solver */
+        md_threadpool_free(&priv->tp_internal);
+
+#if HAVE_OPENCL
+        if (priv->ocl_queue)
+            clReleaseCommandQueue(priv->ocl_queue);
+        if (priv->ocl_ctx)
+            clReleaseContext(priv->ocl_ctx);
+#endif
+    }
+
+    free(priv);
+
+    free(solver->coeffs);
+
+    free(solver);
+    *pctx = NULL;
+}
diff --git a/src/md_solve.h b/src/md_solve.h
new file mode 100644
index 0000000..07d313a
--- /dev/null
+++ b/src/md_solve.h
@@ -0,0 +1,58 @@
+/*
+ * Minimal distortion -- solver public interface
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_SOLVE_H
+#define MD_SOLVE_H
+
+#include "common.h"
+
+#include "cctk.h"
+
+#include "basis.h"
+#include "threadpool.h"
+
+typedef struct MDSolverPriv MDSolverPriv; /* private solver state; not defined in this header */
+
+typedef struct MDSolver { /* public solver context, created by md_solver_init() */
+ MDSolverPriv *priv;
+
+ unsigned int nb_equations; /* number of equations solved together */
+
+ MDBasisSetContext *basis[2][2]; /* basis set for [equation][dimension] */
+
+ int nb_coeffs[2]; /* number of spectral coefficients per dimension */
+ int nb_colloc_points[2]; /* number of collocation points per dimension */
+
+ double *coeffs; /* nb_equations * nb_coeffs[0] * nb_coeffs[1] values, written by md_solver_solve() */
+
+ ThreadPoolContext *tp; /* NOTE(review): md_solver_init() does not appear to set this field; confirm whether it is used */
+} MDSolver;
+
+int md_solver_init(MDSolver **ctx, /* allocate and initialise a solver: *ctx is set and 0 returned on success, a negative error code otherwise */
+ cGH *cctkGH, ThreadPoolContext *tp, /* tp may be NULL, in which case the solver creates its own pool */
+ unsigned int nb_equations,
+ unsigned int (*basis_order)[2], /* basis_order[equation][dimension] */
+ double sf, double filter_power, double input_filter_power);
+
+void md_solver_free(MDSolver **ctx); /* free the solver and set *ctx to NULL; does nothing when *ctx is NULL */
+
+int md_solver_solve(MDSolver *ctx); /* run one solve, writing the result to ctx->coeffs; 0 on success, negative on error */
+
+void md_solver_print_stats(MDSolver *ctx); /* print the accumulated timing statistics to stderr */
+
+
+#endif /* MD_SOLVE_H */
diff --git a/src/md_solve_template.c b/src/md_solve_template.c
new file mode 100644
index 0000000..260405e
--- /dev/null
+++ b/src/md_solve_template.c
@@ -0,0 +1,577 @@
+/*
+ * Minimal distortion -- template for the equations definitions
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define FUNC3(a, b) a ## _ ## b /* paste a and b together with an underscore in between */
+#define FUNC2(a, b) FUNC3(a, b) /* extra level so that the arguments are macro-expanded before pasting */
+#define FUNC(name) FUNC2(name, EQUATION) /* name_<EQUATION>, e.g. calc_eq_coeffs_0 when EQUATION is 0 */
+
+/**
+ * A template for calculating the equation coefficients.
+ */
+static void FUNC(calc_eq_coeffs)(void *arg,
+ unsigned int job_idx, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads)
+{
+ const MDCalcEqThread *et = arg;
+ const MDSolver *ctx = et->ctx;
+ MDEquationContext *eq_ctx = et->eq_ctx;
+
+ const int start = job_idx * et->block_size;
+ const int end = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx));
+
+ for (int i = start; i < end; i++) {
+ const double x = eq_ctx->interp_coords[0][i];
+ const double z = eq_ctx->interp_coords[2][i];
+ const int zaxis = x <= EPS;
+
+ double c1o3 = (1.0 / 3.0);
+
+ double gtu[3][3], g[3][3], gu[3][3];
+ double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], G[3][3][3], dG[3][3][3][3];
+ double A[3][3], Au[3][3];
+ double dA[3][3][3], dAu[3][3][3];
+ double Ric[3][3], Ricm[3][3];
+ double rhs_x, rhs_z;
+
+ const double gtxx = eq_ctx->interp_values[I_GTXX][i];
+ const double gtyy = eq_ctx->interp_values[I_GTYY][i];
+ const double gtzz = eq_ctx->interp_values[I_GTZZ][i];
+ const double gtxy = eq_ctx->interp_values[I_GTXY][i];
+ const double gtxz = eq_ctx->interp_values[I_GTXZ][i];
+ const double gtyz = eq_ctx->interp_values[I_GTYZ][i];
+
+ const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+ { gtxy, gtyy, gtyz },
+ { gtxz, gtyz, gtzz }};
+
+ const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i];
+ const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i];
+ const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i];
+ const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i];
+
+ const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i];
+ const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i];
+ const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i];
+ const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i];
+
+ const double dgt[3][3][3] = {
+ {
+ { dx_gt11, 0.0, dx_gt13 },
+ { 0.0, dx_gt22, 0.0 },
+ { dx_gt13, 0.0, dx_gt33 },
+ },
+ {
+ { 0.0, zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 },
+ { zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x },
+ { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 },
+ },
+ {
+ { dz_gt11, 0.0, dz_gt13 },
+ { 0.0, dz_gt22, 0.0 },
+ { dz_gt13, 0.0, dz_gt33 },
+ },
+ };
+
+ const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i];
+ const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i];
+ const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i];
+ const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i];
+
+ const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i];
+ const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i];
+ const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i];
+ const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i];
+
+ const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i];
+ const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i];
+ const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i];
+ const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i];
+
+ const double d2gt[3][3][3][3] = {
+ {
+ {
+ { dxx_gt11, 0.0, dxx_gt13 },
+ { 0.0, dxx_gt22, 0.0 },
+ { dxx_gt13, 0.0, dxx_gt33 },
+ },
+ {
+ { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+ },
+ {
+ { dxz_gt11, 0.0, dxz_gt13 },
+ { 0.0, dxz_gt22, 0.0 },
+ { dxz_gt13, 0.0, dxz_gt33 },
+ },
+
+ },
+ {
+ {
+ { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 },
+ },
+ {
+ { zaxis ? dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0,
+ zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) },
+ { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 },
+ { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x },
+ },
+ {
+ { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+ { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+ zaxis ? dxz_gt13 : dz_gt13 / x },
+ { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+ },
+
+ },
+ {
+ {
+ { dxz_gt11, 0.0, dxz_gt13 },
+ { 0.0, dxz_gt22, 0.0 },
+ { dxz_gt13, 0.0, dxz_gt33 },
+ },
+ {
+ { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 },
+ { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0,
+ zaxis ? dxz_gt13 : dz_gt13 / x },
+ { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 },
+ },
+ {
+ { dzz_gt11, 0.0, dzz_gt13 },
+ { 0.0, dzz_gt22, 0.0 },
+ { dzz_gt13, 0.0, dzz_gt33 },
+ },
+
+ },
+ };
+
+ const double Atxx = eq_ctx->interp_values[I_ATXX][i];
+ const double Atyy = eq_ctx->interp_values[I_ATYY][i];
+ const double Atzz = eq_ctx->interp_values[I_ATZZ][i];
+ const double Atxy = eq_ctx->interp_values[I_ATXY][i];
+ const double Atxz = eq_ctx->interp_values[I_ATXZ][i];
+ const double Atyz = eq_ctx->interp_values[I_ATYZ][i];
+
+ const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i];
+ const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i];
+ const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i];
+ const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i];
+
+ const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i];
+ const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i];
+ const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i];
+ const double dz_At13 = eq_ctx->interp_values[I_ATXZ_DZ][i];
+
+ const double dAt[3][3][3] = {
+ {
+ { dx_At11, 0.0, dx_At13 },
+ { 0.0, dx_At22, 0.0 },
+ { dx_At13, 0.0, dx_At33 },
+ },
+ {
+ { 0.0, zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 },
+ { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x },
+ { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 },
+ },
+ {
+ { dz_At11, 0.0, dz_At13 },
+ { 0.0, dz_At22, 0.0 },
+ { dz_At13, 0.0, dz_At33 },
+ },
+ };
+
+ const double phi = eq_ctx->interp_values[I_PHI][i];
+
+ const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i];
+ const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i];
+
+ const double dphi[3] = { phi_dx, 0.0, phi_dz };
+
+ const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i];
+ const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i];
+ const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i];
+
+ const double d2phi[3][3] = {
+ { phi_dxx, 0.0, phi_dxz },
+ { 0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 },
+ { phi_dxz, 0.0, phi_dzz },
+ };
+
+ const double At[3][3] = {{ Atxx, Atxy, Atxz },
+ { Atxy, Atyy, Atyz },
+ { Atxz, Atyz, Atzz }};
+
+ const double alpha = eq_ctx->interp_values[I_ALPHA][i];
+ const double dx_alpha = eq_ctx->interp_values[I_ALPHA_DX][i];
+ const double dz_alpha = eq_ctx->interp_values[I_ALPHA_DZ][i];
+
+ const double dalpha[3] = { dx_alpha, 0.0, dz_alpha };
+
+ const double Xtx = eq_ctx->interp_values[I_XTX][i];
+ const double Xtz = eq_ctx->interp_values[I_XTZ][i];
+
+ const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+ // \tilde{γ}^{ij}
+ gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det;
+ gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det;
+ gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det;
+ gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+ gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det;
+ gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+ gtu[1][0] = gtu[0][1];
+ gtu[2][0] = gtu[0][2];
+ gtu[2][1] = gtu[1][2];
+
+ // γ_{jk}/^{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ gu[j][k] = SQR(phi) * gtu[j][k];
+ g[j][k] = gt[j][k] / SQR(phi);
+ }
+
+ // ∂_j γ_{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
+ dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi);
+ }
+
+ // ∂_j γ^{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ for (int n = 0; n < 3; n++)
+ val += -gu[k][m] * gu[l][n] * dg[j][m][n];
+ dgu[j][k][l] = val;
+ }
+
+ // ∂_{jk} g_{lm}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++) {
+ d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) -
+ 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) -
+ 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) -
+ 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) +
+ d2gt[j][k][l][m] / SQR(phi);
+ }
+
+ // Γ^j_{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]);
+ G[j][k][l] = val;
+ }
+
+ // ∂_j Γ^k_{lm}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++) {
+ double val = 0.0;
+ for (int n = 0; n < 3; n++) {
+ val += dgu[j][k][n] * (dg [l][m][n] + dg [m][l][n] - dg [n][l][m]) +
+ gu [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]);
+ }
+ dG[j][k][l][m] = 0.5 * val;
+ }
+
+ // Ric_{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ val += dG[m][m][j][k] - dG[k][m][j][m];
+ for (int m = 0; m < 3; m++)
+ for (int l = 0; l < 3; l++)
+ val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l];
+ Ric[j][k] = val;
+ }
+
+ // Ric^j_k
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ val += gu[j][l] * Ric[l][k];
+ Ricm[j][k] = val;
+ }
+
+ // A_{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ A[j][k] = At[j][k] / SQR(phi);
+ }
+
+ // d_j A^{kl}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int m = 0; m < 3; m++)
+ for (int n = 0; n < 3; n++)
+ val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n];
+ dAu[j][k][l] = val;
+ }
+
+ // A^{jk}
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ for (int m = 0; m < 3; m++)
+ val += gu[j][l] * gu[k][m] * A[l][m];
+ Au[j][k] = val;
+ }
+
+ rhs_x = 0.0;
+ rhs_z = 0.0;
+ for (int j = 0; j < 3; j++) {
+ rhs_x += dalpha[j] * Au[0][j];
+ rhs_z += dalpha[j] * Au[2][j];
+ }
+ for (int j = 0; j < 3; j++) {
+ rhs_x += alpha * dAu[j][0][j];
+ rhs_z += alpha * dAu[j][2][j];
+ }
+ for (int j = 0; j < 3; j++) {
+ double val_x = 0.0;
+ double val_z = 0.0;
+ for (int k = 0; k < 3; k++) {
+ val_x += G[0][j][k] * Au[k][j];
+ val_z += G[2][j][k] * Au[k][j];
+ }
+ rhs_x += val_x * alpha;
+ rhs_z += val_z * alpha;
+ }
+ for (int j = 0; j < 3; j++) {
+ double val_x = 0.0;
+ double val_z = 0.0;
+ for (int k = 0; k < 3; k++) {
+ val_x += G[j][j][k] * Au[0][k];
+ val_z += G[j][j][k] * Au[2][k];
+ }
+ rhs_x += val_x * alpha;
+ rhs_z += val_z * alpha;
+ }
+
+ rhs_x *= 2.0;
+ rhs_z *= 2.0;
+
+ double X[3] = { 0.0 };
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ X[0] += gu[j][k] * G[0][j][k];
+ X[2] += gu[j][k] * G[2][j][k];
+ }
+
+ if (EQUATION == 0) {
+ /* eq 0 */
+ /* ∂_{xx}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + c1o3 * gu[0][0] + (zaxis ? 0.5 * (gu[1][1] + c1o3 * gu[0][0]) : 0.0);
+ /* ∂_{xx}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0;
+ /* ∂_{zz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2];
+ /* ∂_{zz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gu[0][2];
+
+ /* ∂_{xz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2] + (zaxis ? c1o3 * gu[0][2] : 0.0);
+ /* ∂_{xz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[0][0];
+
+ /* ∂_{x}β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[0][j] * G[0][j][0];
+ t1 += G[j][j][0];
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[0][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[0][1][1] : (gu[1][1] + c1o3 * gu[0][0]) / x);
+ }
+ /* ∂_{x}β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[0][j] * G[0][j][2];
+ t1 += G[j][j][2];
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[0][0] * t1;
+ }
+
+ /* ∂_{z}β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[2][j] * G[0][j][0];
+ t1 += G[j][j][0];
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[0][2] * t1 + (zaxis ? 0.0 : c1o3 * gu[0][2] / x);
+ }
+ /* ∂_{z}β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[2][j] * G[0][j][2];
+ t1 += G[j][j][2];
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[0][2] * t1;
+ }
+
+ /* β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int k = 0; k < 3; k++) {
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int j = 0; j < 3; j++)
+ val += G[0][k][j] * G[j][l][0] - G[j][k][l] * G[0][0][j];
+ t0 += gu[k][l] * (dG[k][0][l][0] + val);
+ t1 += gu[0][k] * dG[k][l][l][0];
+ }
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][0] + (zaxis ? 0.0 : 2.0 * gu[1][1] * G[0][1][1] / x - (gu[1][1] + c1o3 * gu[0][0]) / SQR(x));
+ }
+
+ /* β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int k = 0; k < 3; k++) {
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int j = 0; j < 3; j++)
+ val += G[0][k][j] * G[j][l][2] - G[j][k][l] * G[0][2][j];
+ t0 += gu[k][l] * (dG[k][0][l][2] + val);
+ t1 += gu[0][k] * dG[k][l][l][2];
+ }
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][2];
+ }
+
+ eq_ctx->rhs[i] = rhs_x;
+ } else {
+ /* eq 1 */
+ /* ∂_{xx}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gu[2][0] + (zaxis ? c1o3 * 0.5 * gu[2][0] : 0.0);
+ /* ∂_{xx}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + (zaxis ? gu[1][1] : 0.0);
+ /* ∂_{zz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0;
+ /* ∂_{zz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2] + c1o3 * gu[2][2];
+ /* ∂_{xz}β^x */
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[2][2] + (zaxis ? c1o3 * gu[2][2] : 0.0);
+ /* ∂_{xz}β^z */
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2];
+
+ /* ∂_{x}β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[0][j] * G[2][j][0];
+ t1 += G[j][j][0];
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[2][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[2][1][1] : c1o3 * gu[2][0] / x);
+ }
+ /* ∂_{x}β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[0][j] * G[2][j][2];
+ t1 += G[j][j][2];
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[2][0] * t1 + (zaxis ? 0.0 : gu[1][1] / x);
+ }
+ /* ∂_{z}β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[2][j] * G[2][j][0];
+ t1 += G[j][j][0];
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[2][2] * t1 + (zaxis ? 0.0 : c1o3 * gu[2][2] / x);
+ }
+ /* ∂_{z}β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int j = 0; j < 3; j++) {
+ t0 += gu[2][j] * G[2][j][2];
+ t1 += G[j][j][2];
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[2][2] * t1;
+ }
+
+ /* β^x */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int k = 0; k < 3; k++) {
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int j = 0; j < 3; j++)
+ val += G[2][k][j] * G[j][l][0] - G[j][k][l] * G[2][0][j];
+ t0 += gu[k][l] * (dG[k][2][l][0] + val);
+ t1 += gu[2][k] * dG[k][l][l][0];
+ }
+ }
+ eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][0] + (zaxis ? 0.0 : 2.0 * gu[1][1] * G[2][1][1] / x - c1o3 * gu[2][0] / SQR(x));
+ }
+
+ /* β^z */
+ {
+ double t0 = 0.0;
+ double t1 = 0.0;
+ for (int k = 0; k < 3; k++) {
+ for (int l = 0; l < 3; l++) {
+ double val = 0.0;
+ for (int j = 0; j < 3; j++)
+ val += G[2][k][j] * G[j][l][2] - G[j][k][l] * G[2][2][j];
+ t0 += gu[k][l] * (dG[k][2][l][2] + val);
+ t1 += gu[2][k] * dG[k][l][l][2];
+ }
+ }
+ eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][2];
+ }
+
+ eq_ctx->rhs[i] = rhs_z;
+ }
+ }
+}
diff --git a/src/pssolve.c b/src/pssolve.c
new file mode 100644
index 0000000..1f5bb44
--- /dev/null
+++ b/src/pssolve.c
@@ -0,0 +1,498 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include "bicgstab.h"
+#include "pssolve.h"
+#include "threadpool.h"
+
+/* Total number of spectral coefficients of an equation context: the product of
+ * its per-direction coefficient counts. */
+#define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1])
+/* Total number of collocation points of an equation context: the product of
+ * its per-direction point counts. */
+#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1])
+
+/* Per-equation (and per-unknown) state of the solver. */
+typedef struct PSEquationContext {
+ /* number of basis functions per direction; copied from solve_order on init */
+ size_t nb_coeffs[2];
+ /* number of collocation points per direction; copied from solve_order on init */
+ size_t nb_colloc_points[2];
+ /* order passed to md_basis_colloc_point() when building the collocation grid */
+ size_t colloc_grid_order[2];
+
+ /* basis_val[var][diff_order] is a table of the 2D basis functions of
+ * variable var (or their derivatives) evaluated at this equation's
+ * collocation points; the entry for (grid point, coefficient) is stored at
+ * idx_grid + NB_COLLOC_POINTS(this equation) * idx_coeff */
+ double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB];
+ /* points into PSSolvePriv.mat, at the first row belonging to this equation */
+ double *mat;
+} PSEquationContext;
+
+/* Private solver state hidden behind PSSolveContext.priv. */
+struct PSSolvePriv {
+ /* iterative solver, (re)initialized from the inverted matrix after each LU solve */
+ BiCGStabContext *bicgstab;
+ /* number of successful BiCGSTAB solves since the last LU inversion;
+ * set to INT_MAX on init so that the first solve uses LU */
+ int steps_since_inverse;
+
+ /* total number of unknown coefficients over all equations (matrix dimension) */
+ size_t nb_coeffs;
+
+ /* array of nb_equations per-equation contexts */
+ PSEquationContext *eqs;
+
+ /* LAPACK pivot indices, nb_coeffs entries */
+ int *ipiv;
+ /* nb_coeffs x nb_coeffs matrix, addressed column-major (row + nb_coeffs * column) */
+ double *mat;
+
+ /* thread pool actually used: either the caller's or tp_internal */
+ ThreadPoolContext *tp;
+ /* single-threaded pool created on init when the caller supplied none; owned by the solver */
+ ThreadPoolContext *tp_internal;
+};
+
+/* Arguments shared by all construct_matrix() jobs filling one matrix block
+ * (one equation, one variable). */
+typedef struct ConstructMatrixThread {
+ /* the equation whose rows are being filled */
+ const PSEquationContext *eq_ctx;
+ /* eq_coeffs[diff_order] is the array of equation coefficients at the collocation points */
+ const double **eq_coeffs;
+ /* first element of the block to be written */
+ double *mat;
+ /* distance in elements between two consecutive matrix columns */
+ ptrdiff_t mat_stride;
+ /* index of the variable whose basis functions form the columns */
+ unsigned int var_idx;
+} ConstructMatrixThread;
+
+/*
+ * Thread pool job computing one column of one matrix block.
+ *
+ * The job index selects the spectral coefficient (the matrix column). For each
+ * collocation point of the equation (the matrix row) the entry is the sum,
+ * over all differentiation orders, of the equation coefficient at that point
+ * multiplied by the matching precomputed basis value/derivative.
+ * nb_jobs, thread_idx and nb_threads are not used.
+ */
+static void construct_matrix(void *arg,
+                             unsigned int job_idx, unsigned int nb_jobs,
+                             unsigned int thread_idx, unsigned int nb_threads)
+{
+    ConstructMatrixThread *job = arg;
+    const PSEquationContext *eq = job->eq_ctx;
+    const double **terms = job->eq_coeffs;
+    const unsigned int var = job->var_idx;
+    const unsigned int col = job_idx;
+    double *column = job->mat + job->mat_stride * col;
+
+    for (int row = 0; row < NB_COLLOC_POINTS(eq); row++) {
+        /* position of (row, col) inside the precomputed basis tables */
+        const int src = row + NB_COLLOC_POINTS(eq) * col;
+        double sum = 0.0;
+        int order = 0;
+
+        while (order < PSSOLVE_DIFF_ORDER_NB) {
+            sum += terms[order][row] * eq->basis_val[var][order][src];
+            order++;
+        }
+
+        column[row] = sum;
+    }
+}
+
+/*
+ * Solve mat * x = rhs with an LU factorization and invert mat.
+ *
+ * @param N    dimension of the system
+ * @param mat  N x N column-major matrix; replaced by its inverse on success
+ * @param rhs  right-hand side; replaced by the solution on success
+ * @param ipiv work array of N pivot indices
+ *
+ * @return 0 on success, -ENOMEM if the work buffers cannot be allocated,
+ *         -EINVAL if LAPACK rejects an argument, -EDOM if the matrix is
+ *         exactly singular. On failure mat and rhs are left untouched.
+ */
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    /* computed in size_t, N * N may not fit into an int */
+    const size_t mat_elems = (size_t)N * N;
+
+    char equed = 'N';
+    double cond = 0.0, ferr = 0.0, berr = 0.0, rpivot = 0.0;
+
+    double *mat_f = NULL, *x = NULL;
+    lapack_int info;
+    int ret = 0;
+
+    mat_f = malloc(mat_elems * sizeof(*mat_f));
+    x     = malloc(N * sizeof(*x));
+    if (!mat_f || !x) {
+        ret = -ENOMEM;
+        goto finish;
+    }
+
+    /* factorize into mat_f and solve into x; mat and rhs are only read here */
+    info = LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                          mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                          rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    /* info == N + 1 only warns that the matrix is singular to working
+     * precision, the solution is still computed in that case */
+    if (info < 0 || (info > 0 && info <= N)) {
+        fprintf(stderr, "LAPACKE_dgesvx() failed: info = %ld\n", (long)info);
+        ret = info < 0 ? -EINVAL : -EDOM;
+        goto finish;
+    }
+
+    /* turn the LU factors into the inverse matrix */
+    info = LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    if (info) {
+        fprintf(stderr, "LAPACKE_dgetri() failed: info = %ld\n", (long)info);
+        ret = info < 0 ? -EINVAL : -EDOM;
+        goto finish;
+    }
+
+    memcpy(rhs, x, N * sizeof(double));
+    memcpy(mat, mat_f, mat_elems * sizeof(double));
+
+    /* N is an int, so it must be printed with %d;
+     * cond is the rcond value reported by dgesvx */
+    fprintf(stderr, "LU factorization solution to a %dx%d matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+finish:
+    free(mat_f);
+    free(x);
+
+    return ret;
+}
+
+/*
+ * Assemble the collocation matrix from the equation coefficients and solve the
+ * resulting linear system for the spectral coefficients.
+ *
+ * BiCGSTAB is tried first. When it fails, or when 1024 iterative solves have
+ * been done since the last inversion, the system is solved by LU
+ * factorization and the inverted matrix is handed to md_bicgstab_init().
+ *
+ * @param eq_coeffs eq_coeffs[i][j][order] is the array of coefficients that
+ *                  multiply the given derivative of variable j in equation i
+ * @param rhs       right-hand side, one value per matrix row
+ * @param coeffs    output, one value per unknown coefficient
+ *
+ * @return a non-negative number on success (the value returned by the
+ *         BiCGSTAB solve or by md_bicgstab_init()), a negative error code
+ *         on failure
+ */
+int md_pssolve_solve(PSSolveContext *ctx,
+                     const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
+                     const double *rhs, double *coeffs)
+{
+    PSSolvePriv *s = ctx->priv;
+    int64_t start;
+
+    int ret = 0;
+
+    /* fill the matrix */
+    start = gettime();
+
+    for (unsigned int i = 0; i < ctx->nb_equations; i++) {
+        PSEquationContext *eq_ctx = &s->eqs[i];
+        double *mat = s->eqs[i].mat;
+
+        for (unsigned int j = 0; j < ctx->nb_equations; j++) {
+            ConstructMatrixThread thread = {
+                .eq_ctx     = eq_ctx,
+                .eq_coeffs  = eq_coeffs[i][j],
+                .mat        = mat,
+                .mat_stride = s->nb_coeffs,
+                .var_idx    = j,
+            };
+            /* one job per column of the (equation i, variable j) block */
+            md_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix,
+                                  &thread);
+            mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs;
+        }
+    }
+
+    ctx->construct_matrix_time += gettime() - start;
+    ctx->construct_matrix_count++;
+
+    /* solve for the coeffs */
+    if (s->steps_since_inverse < 1024) {
+        start = gettime();
+
+        CCTK_TimerStart("MinimalDistortion_solve_BiCGSTAB");
+        ret = md_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
+        CCTK_TimerStop("MinimalDistortion_solve_BiCGSTAB");
+
+        if (ret >= 0) {
+            ctx->cg_time_total  += gettime() - start;
+            ctx->cg_solve_count++;
+            ctx->cg_iter_count  += ret + 1;
+            s->steps_since_inverse++;
+        }
+    } else
+        ret = -1;
+
+    if (ret < 0) {
+        CCTK_TimerStart("MinimalDistortion_solve_LU");
+        start = gettime();
+
+        memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs));
+
+        ret = lu_invert(s->nb_coeffs, s->mat, coeffs, s->ipiv);
+        ctx->lu_solves_time += gettime() - start;
+        ctx->lu_solves_count++;
+        CCTK_TimerStop("MinimalDistortion_solve_LU");
+
+        /* do not let the BiCGSTAB init result hide a failed LU solve */
+        if (ret >= 0)
+            ret = md_bicgstab_init(s->bicgstab, s->mat, coeffs);
+        if (ret < 0) {
+            /* the iterative solver has no valid state, force LU next time */
+            s->steps_since_inverse = INT_MAX;
+            return ret;
+        }
+
+        s->steps_since_inverse = 0;
+    }
+
+    return ret;
+}
+
+/*
+ * Precompute, for equation eq_idx, the values and derivatives of the basis
+ * functions of every variable at the collocation points of that equation.
+ *
+ * For each variable the 1D basis functions and their first two derivatives are
+ * evaluated in both directions and then multiplied into 2D tables, one per
+ * differentiation order. The entry for (grid point, coefficient) is stored at
+ * idx_grid + NB_COLLOC_POINTS(equation) * idx_coeff.
+ *
+ * @return 0 on success, -ENOMEM on allocation failure. Tables allocated before
+ *         a failure stay attached to the equation context.
+ */
+static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx)
+{
+    PSSolvePriv *s = ctx->priv;
+    PSEquationContext *eq_ctx = &s->eqs[eq_idx];
+    int ret = 0;
+
+    eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val));
+    if (!eq_ctx->basis_val)
+        return -ENOMEM;
+
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        /* the variable whose basis functions are being tabulated */
+        const PSEquationContext *var_ctx = &s->eqs[i];
+        double *basis_val[2][3] = { { NULL } };
+
+        /* for each direction, compute the corresponding basis values/derivatives */
+        for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) {
+            for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) {
+                /* allocate through a temporary so the table entry stays NULL on failure */
+                void *buf = NULL;
+
+                if (posix_memalign(&buf, 32,
+                                   sizeof(*basis_val[dir][diff_order]) * var_ctx->nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir])) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
+                basis_val[dir][diff_order] = buf;
+            }
+
+            for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) {
+                double coord = ctx->colloc_grid[eq_idx][dir][k];
+                for (int l = 0; l < var_ctx->nb_coeffs[dir]; l++) {
+                    basis_val[dir][0][k * var_ctx->nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_VALUE, coord, l);
+                    basis_val[dir][1][k * var_ctx->nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF1, coord, l);
+                    basis_val[dir][2][k * var_ctx->nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF2, coord, l);
+                }
+            }
+        }
+
+        for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) {
+            void *buf = NULL;
+
+            /* rows are the collocation points of this equation, columns are
+             * the coefficients of variable i, so the table must be sized with
+             * the variable's coefficient count, not the equation's */
+            if (posix_memalign(&buf, 32,
+                               NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(var_ctx) * sizeof(*eq_ctx->basis_val[i][diff]))) {
+                ret = -ENOMEM;
+                goto fail;
+            }
+            eq_ctx->basis_val[i][diff] = buf;
+        }
+
+        for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) {
+            const double *basis1   = basis_val[1][0] + j * var_ctx->nb_coeffs[1];
+            const double *dbasis1  = basis_val[1][1] + j * var_ctx->nb_coeffs[1];
+            const double *d2basis1 = basis_val[1][2] + j * var_ctx->nb_coeffs[1];
+
+            for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) {
+                const double *basis0   = basis_val[0][0] + k * var_ctx->nb_coeffs[0];
+                const double *dbasis0  = basis_val[0][1] + k * var_ctx->nb_coeffs[0];
+                const double *d2basis0 = basis_val[0][2] + k * var_ctx->nb_coeffs[0];
+
+                const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k;
+
+                for (int l = 0; l < var_ctx->nb_coeffs[1]; l++)
+                    for (int m = 0; m < var_ctx->nb_coeffs[0]; m++) {
+                        const int idx_coeff = l * var_ctx->nb_coeffs[0] + m;
+                        const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff;
+
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] =   basis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] =  dbasis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] =   basis0[m] *  dbasis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] *   basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] =   basis0[m] * d2basis1[l];
+                        eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] =  dbasis0[m] *  dbasis1[l];
+                    }
+            }
+        }
+
+fail:
+        /* the 1D tables are temporary and released on both success and failure */
+        for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++)
+            for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++)
+                free(basis_val[dir][diff]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Finish setting up a solver once the caller has filled in the basis sets and
+ * solver orders: pick the thread pool, validate the parameters, allocate the
+ * matrix and pivot array, compute the collocation grids, precompute the basis
+ * tables and allocate the BiCGSTAB context.
+ *
+ * @return 0 on success, -EINVAL when a basis set or solver order is missing,
+ *         -ENOMEM on allocation failure, or the negative error code of a
+ *         failing helper. Whatever was allocated before a failure is released
+ *         by md_pssolve_context_free().
+ */
+int md_pssolve_context_init(PSSolveContext *ctx)
+{
+    PSSolvePriv *s = ctx->priv;
+    size_t N = 0;
+    void *buf;
+
+    int ret = 0;
+
+    if (ctx->tp) {
+        s->tp = ctx->tp;
+    } else {
+        ret = md_threadpool_init(&s->tp_internal, 1);
+        if (ret < 0)
+            return ret;
+        s->tp = s->tp_internal;
+    }
+
+    /* sanity check the parameters */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        if (!ctx->basis[i][0] || !ctx->basis[i][1]) {
+            fprintf(stderr, "Basis set for variable %d not set\n", i);
+            return -EINVAL;
+        }
+        if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) {
+            fprintf(stderr, "Solver order for variable %d not set\n", i);
+            return -EINVAL;
+        }
+
+        N += ctx->solve_order[i][0] * ctx->solve_order[i][1];
+    }
+
+    /* posix_memalign() reports failure through its return value only, so
+     * allocate into a temporary and store the pointer on success */
+    buf = NULL;
+    if (posix_memalign(&buf, 32, sizeof(*s->ipiv) * N))
+        return -ENOMEM;
+    s->ipiv = buf;
+
+    buf = NULL;
+    if (posix_memalign(&buf, 32, sizeof(*s->mat) * N * N))
+        return -ENOMEM;
+    s->mat = buf;
+
+    s->nb_coeffs = N;
+
+    ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid));
+    if (!ctx->colloc_grid)
+        return -ENOMEM;
+
+    /* initialize the per-equation state */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        PSEquationContext *eq_ctx = &s->eqs[i];
+
+        eq_ctx->nb_coeffs[0]         = ctx->solve_order[i][0];
+        eq_ctx->nb_coeffs[1]         = ctx->solve_order[i][1];
+        eq_ctx->nb_colloc_points[0]  = ctx->solve_order[i][0];
+        eq_ctx->nb_colloc_points[1]  = ctx->solve_order[i][1];
+        eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0];
+        eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1];
+
+        /* the rows of equation i follow those of equation i - 1 */
+        if (i == 0)
+            eq_ctx->mat = s->mat;
+        else
+            eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]);
+
+        /* compute the collocation grid */
+        for (int dir = 0; dir < 2; dir++) {
+            buf = NULL;
+            if (posix_memalign(&buf, 32, eq_ctx->nb_colloc_points[dir] * sizeof(*ctx->colloc_grid[i][dir])))
+                return -ENOMEM;
+            ctx->colloc_grid[i][dir] = buf;
+        }
+
+        for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++)
+            ctx->colloc_grid[i][0][j] = md_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j);
+        for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++)
+            ctx->colloc_grid[i][1][j] = md_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j);
+    }
+
+    /* precompute the basis values we will need */
+    for (int i = 0; i < ctx->nb_equations; i++) {
+        ret = basis_val_init(ctx, i);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* force an LU solve on the first call to md_pssolve_solve() */
+    s->steps_since_inverse = INT_MAX;
+
+    /* init the BiCGStab solver */
+    ret = md_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+/*
+ * Create a zero-initialized solver for nb_equations equations, together with
+ * its private data and the per-equation arrays the caller has to fill in.
+ *
+ * Returns 0 and stores the solver in *pctx on success. Returns -EINVAL when
+ * nb_equations is zero and -ENOMEM when an allocation fails; *pctx is not
+ * written in either of those cases.
+ */
+int md_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations)
+{
+    PSSolveContext *solver;
+
+    if (nb_equations == 0)
+        return -EINVAL;
+
+    solver = calloc(1, sizeof(*solver));
+    if (solver == NULL)
+        return -ENOMEM;
+
+    solver->nb_equations = nb_equations;
+
+    /* the allocations run in order and stop at the first one that fails */
+    if (!(solver->priv        = calloc(1, sizeof(*solver->priv)))                 ||
+        !(solver->priv->eqs   = calloc(nb_equations, sizeof(*solver->priv->eqs))) ||
+        !(solver->basis       = calloc(nb_equations, sizeof(*solver->basis)))     ||
+        !(solver->solve_order = calloc(nb_equations, sizeof(*solver->solve_order)))) {
+        md_pssolve_context_free(&solver);
+        return -ENOMEM;
+    }
+
+    *pctx = solver;
+    return 0;
+}
+
+/*
+ * Release a solver and everything it owns, then set *pctx to NULL.
+ *
+ * Safe to call on a partially constructed context (as done on the failure
+ * path of md_pssolve_context_alloc()) and when *pctx is already NULL.
+ */
+void md_pssolve_context_free(PSSolveContext **pctx)
+{
+    PSSolveContext *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        if (ctx->priv->eqs) {
+            for (unsigned int i = 0; i < ctx->nb_equations; i++) {
+                PSEquationContext *eq_ctx = &ctx->priv->eqs[i];
+
+                /* the tables only exist once basis_val_init() has run */
+                if (!eq_ctx->basis_val)
+                    continue;
+
+                for (unsigned int j = 0; j < ctx->nb_equations; j++)
+                    for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++)
+                        free(eq_ctx->basis_val[j][k]);
+                free(eq_ctx->basis_val);
+            }
+        }
+
+        free(ctx->priv->eqs);
+
+        free(ctx->priv->ipiv);
+        free(ctx->priv->mat);
+
+        md_bicgstab_context_free(&ctx->priv->bicgstab);
+        md_threadpool_free(&ctx->priv->tp_internal);
+    }
+
+    free(ctx->priv);
+
+    if (ctx->colloc_grid) {
+        for (unsigned int i = 0; i < ctx->nb_equations; i++)
+            for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++)
+                free(ctx->colloc_grid[i][j]);
+    }
+
+    /* colloc_grid is one allocation of nb_equations pointer pairs; freeing
+     * colloc_grid[0] and colloc_grid[1] would release it once through its
+     * base address and then once more through an interior pointer */
+    free(ctx->colloc_grid);
+
+    free(ctx->basis);
+    free(ctx->solve_order);
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/pssolve.h b/src/pssolve.h
new file mode 100644
index 0000000..e6a4c1a
--- /dev/null
+++ b/src/pssolve.h
@@ -0,0 +1,139 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_PSSOLVE_H
+#define MD_PSSOLVE_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+#include <stdint.h>
+
+#include "basis.h"
+#include "threadpool.h"
+
+/**
+ * Differentiation orders of the terms in the equations.
+ *
+ * PSSOLVE_DIFF_ORDER_ab stands for the a-th derivative along direction 0
+ * combined with the b-th derivative along direction 1 (00 is the function
+ * itself). The values are used as array indices, PSSOLVE_DIFF_ORDER_NB is the
+ * number of entries.
+ */
+enum PSSolveDiffOrder {
+ PSSOLVE_DIFF_ORDER_00,
+ PSSOLVE_DIFF_ORDER_10,
+ PSSOLVE_DIFF_ORDER_01,
+ PSSOLVE_DIFF_ORDER_11,
+ PSSOLVE_DIFF_ORDER_20,
+ PSSOLVE_DIFF_ORDER_02,
+ PSSOLVE_DIFF_ORDER_NB,
+};
+
+typedef struct PSSolvePriv PSSolvePriv;
+
+/**
+ * Public solver context. Allocated with md_pssolve_context_alloc(), freed
+ * with md_pssolve_context_free().
+ */
+typedef struct PSSolveContext {
+ /**
+ * Solver private data, not to be touched by the caller.
+ */
+ PSSolvePriv *priv;
+
+ /**
+ * Number of equations/unknown functions in the set.
+ * Set by md_pssolve_context_alloc().
+ */
+ unsigned int nb_equations;
+
+ /**
+ * The basis sets.
+ *
+ * basis[i][j] is the basis set used for i-th variable in j-th direction.
+ *
+ * The array is allocated by md_pssolve_context_alloc(), must be filled
+ * by the caller before md_pssolve_context_init().
+ */
+ const MDBasisSetContext *(*basis)[2];
+
+ /**
+ * Order of the solver.
+ *
+ * solve_order[i][j] is the order of the solver (i.e. the number of the
+ * basis functions used) for i-th variable in j-th direction.
+ *
+ * Allocated by md_pssolve_context_alloc(), must be filled by the caller
+ * before md_pssolve_context_init().
+ */
+ unsigned int (*solve_order)[2];
+
+ /**
+ * Locations of the collocation points. The equation coefficients passed to
+ * md_pssolve_solve() should be evaluated at those grid positions.
+ *
+ * colloc_grid[i][j] is an array of length solve_order[i][j] and contains
+ * the collocation points for the i-th variable in the j-th direction.
+ *
+ * Set by the solver after md_pssolve_context_init().
+ */
+ double *(*colloc_grid)[2];
+
+ /**
+ * The thread pool used for multithreaded execution. May be set by the
+ * caller before md_pssolve_context_init(), otherwise a single thread will
+ * be used.
+ */
+ ThreadPoolContext *tp;
+
+ /**
+ * OpenCL context and command queue. They are handed unchanged to the
+ * BiCGSTAB solver in md_pssolve_context_init().
+ */
+ cl_context ocl_ctx;
+ cl_command_queue ocl_queue;
+
+ /**
+ * Statistics updated by md_pssolve_solve(). The *_count fields count
+ * events; the time fields accumulate gettime() differences
+ * (NOTE(review): the time unit is whatever gettime() returns - confirm).
+ */
+ /* number of LU solves and the time spent in them */
+ uint64_t lu_solves_count;
+ uint64_t lu_solves_time;
+
+ /* number of successful BiCGSTAB solves, the sum of (return value + 1) of
+ * those solves, and the time spent in them */
+ uint64_t cg_solve_count;
+ uint64_t cg_iter_count;
+ uint64_t cg_time_total;
+
+ /* number of matrix constructions and the time spent in them */
+ uint64_t construct_matrix_count;
+ uint64_t construct_matrix_time;
+} PSSolveContext;
+
+/**
+ * Allocate a new solver.
+ *
+ * @param ctx the newly allocated solver is written here on success.
+ * @param nb_equations number of equations/unknown functions, must be non-zero.
+ *
+ * @return 0 on success, -EINVAL if nb_equations is zero, -ENOMEM on
+ * allocation failure.
+ */
+int md_pssolve_context_alloc(PSSolveContext **ctx, unsigned int nb_equations);
+
+/**
+ * Initialize the solver for use after all the context options have been set.
+ *
+ * @return 0 on success, a negative error code on failure.
+ */
+int md_pssolve_context_init(PSSolveContext *ctx);
+
+/**
+ * Free the solver and all its internal state. *ctx is set to NULL.
+ */
+void md_pssolve_context_free(PSSolveContext **ctx);
+
+/**
+ * Solve a second order linear PDE in 2D with a pseudospectral method.
+ *
+ * @param eq_coeffs the equation coefficients. eq_coeffs[i][j][order] is an
+ * array holding, for each collocation point of the i-th
+ * equation, the coefficient multiplying the derivative of the
+ * j-th variable given by order (an enum PSSolveDiffOrder value).
+ * @param rhs the right-hand side of the equation at the collocation points.
+ * @param coeffs the spectral coefficients of the solution will be written here.
+ *
+ * @return a non-negative number on success, a negative error code on failure.
+ */
+int md_pssolve_solve(PSSolveContext *ctx,
+ const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
+ const double *rhs, double *coeffs);
+
+#endif /* MD_PSSOLVE_H */
diff --git a/src/register.c b/src/register.c
new file mode 100644
index 0000000..64b47ce
--- /dev/null
+++ b/src/register.c
@@ -0,0 +1,7 @@
+/*
+ * Register the three ML_BSSN::beta variables with MoL as constrained
+ * variables. The results of the lookups and registrations are not checked.
+ */
+void minimal_distortion_axi_register_mol(CCTK_ARGUMENTS)
+{
+    static const char *const shift_vars[] = {
+        "ML_BSSN::beta1",
+        "ML_BSSN::beta2",
+        "ML_BSSN::beta3",
+    };
+
+    for (unsigned int i = 0; i < sizeof(shift_vars) / sizeof(*shift_vars); i++)
+        MoLRegisterConstrained(CCTK_VarIndex(shift_vars[i]));
+}
+
diff --git a/src/threadpool.c b/src/threadpool.c
new file mode 100644
index 0000000..2febdcb
--- /dev/null
+++ b/src/threadpool.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdlib.h>
+
+#include "threadpool.h"
+
+/* State of a single worker thread. */
+typedef struct WorkerContext {
+ /* the pool this worker belongs to */
+ ThreadPoolContext *parent;
+ /* handle of the thread running worker_thread() */
+ pthread_t thread;
+ /* index of the worker inside the pool, passed to the job as thread_idx */
+ unsigned int idx;
+} WorkerContext;
+
+/* Thread pool state; the job-related fields are accessed with mutex held. */
+struct ThreadPoolContext {
+ /* array of worker states; nb_workers counts the threads actually started */
+ WorkerContext *workers;
+ unsigned int nb_workers;
+
+ pthread_mutex_t mutex;
+ /* signalled when new jobs are posted, when a job finishes and on shutdown */
+ pthread_cond_t cond;
+ /* job callback and its opaque argument for the batch being executed */
+ void (*func)(void *arg,
+ unsigned int job_idx, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads);
+ void *func_arg;
+ /* index of the next job to hand out */
+ int next_job;
+ /* number of jobs in the current batch */
+ int nb_jobs;
+ /* number of jobs of the current batch that have completed */
+ int nb_jobs_finished;
+
+ /* set to non-zero to make the workers exit */
+ int finish;
+};
+
+/*
+ * Main loop of a pool worker.
+ *
+ * Sleeps on the pool condition until there is an unclaimed job or the pool is
+ * shutting down, claims the next job index under the mutex, runs the callback
+ * without holding the mutex, then records completion and wakes the waiters.
+ *
+ * The function is only referenced from this file, so it has internal linkage
+ * to keep its generic name from clashing with other objects at link time.
+ */
+static void *worker_thread(void *arg)
+{
+    WorkerContext *w = arg;
+    ThreadPoolContext *ctx = w->parent;
+    int nb_jobs, job_idx;
+
+    while (1) {
+        pthread_mutex_lock(&ctx->mutex);
+        while (!ctx->finish && ctx->next_job >= ctx->nb_jobs)
+            pthread_cond_wait(&ctx->cond, &ctx->mutex);
+
+        if (ctx->finish) {
+            pthread_mutex_unlock(&ctx->mutex);
+            break;
+        }
+
+        /* claim a job while the mutex is still held */
+        nb_jobs = ctx->nb_jobs;
+        job_idx = ctx->next_job++;
+
+        pthread_mutex_unlock(&ctx->mutex);
+
+        ctx->func(ctx->func_arg, job_idx, nb_jobs, w->idx, ctx->nb_workers);
+
+        pthread_mutex_lock(&ctx->mutex);
+
+        ctx->nb_jobs_finished++;
+
+        /* wakes md_threadpool_execute() waiting for the batch to complete */
+        pthread_cond_broadcast(&ctx->cond);
+        pthread_mutex_unlock(&ctx->mutex);
+    }
+    return NULL;
+}
+
+/*
+ * Create a pool with nb_threads worker threads.
+ *
+ * Returns 0 and stores the pool in *pctx on success. Returns -ENOSYS when
+ * nb_threads is zero, -ENOMEM when an allocation fails and the negated
+ * pthread_create() error code when a thread cannot be started; threads that
+ * were already started are shut down again in that case.
+ */
+int md_threadpool_init(ThreadPoolContext **pctx, unsigned int nb_threads)
+{
+    ThreadPoolContext *pool;
+    int err;
+
+    if (nb_threads == 0)
+        return -ENOSYS;
+
+    pool = calloc(1, sizeof(*pool));
+    if (pool == NULL)
+        return -ENOMEM;
+
+    pthread_mutex_init(&pool->mutex, NULL);
+    pthread_cond_init(&pool->cond, NULL);
+
+    pool->workers = calloc(nb_threads, sizeof(*pool->workers));
+    err = pool->workers ? 0 : -ENOMEM;
+
+    /* nb_workers only counts the threads that were started, so that a
+     * partially built pool can be torn down correctly */
+    while (!err && pool->nb_workers < nb_threads) {
+        WorkerContext *worker = &pool->workers[pool->nb_workers];
+
+        worker->idx    = pool->nb_workers;
+        worker->parent = pool;
+
+        err = -pthread_create(&worker->thread, NULL, worker_thread, worker);
+        if (!err)
+            pool->nb_workers++;
+    }
+
+    if (err) {
+        md_threadpool_free(&pool);
+        return err;
+    }
+
+    *pctx = pool;
+    return 0;
+}
+
+/*
+ * Shut down and release a pool: ask the workers to exit, wait for each of
+ * them, destroy the synchronization objects and free the memory.
+ * *pctx is set to NULL; nothing happens when it already is NULL.
+ */
+void md_threadpool_free(ThreadPoolContext **pctx)
+{
+    ThreadPoolContext *pool = *pctx;
+
+    if (pool == NULL)
+        return;
+
+    /* raise the exit flag and wake every worker sleeping on the condition */
+    pthread_mutex_lock(&pool->mutex);
+    pool->finish = 1;
+    pthread_cond_broadcast(&pool->cond);
+    pthread_mutex_unlock(&pool->mutex);
+
+    for (unsigned int n = 0; n != pool->nb_workers; n++)
+        pthread_join(pool->workers[n].thread, NULL);
+
+    pthread_mutex_destroy(&pool->mutex);
+    pthread_cond_destroy(&pool->cond);
+
+    free(pool->workers);
+    free(pool);
+
+    *pctx = NULL;
+}
+
+/*
+ * Run func once for every job index in [0, nb_jobs) on the pool workers and
+ * return when all of the jobs have finished.
+ */
+void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs,
+                           void (*func)(void *arg,
+                                        unsigned int job_idx, unsigned int nb_jobs,
+                                        unsigned int thread_idx, unsigned int nb_threads),
+                           void *arg)
+{
+    pthread_mutex_lock(&ctx->mutex);
+
+    /* publish the new batch */
+    ctx->func             = func;
+    ctx->func_arg         = arg;
+    ctx->nb_jobs          = nb_jobs;
+    ctx->nb_jobs_finished = 0;
+    ctx->next_job         = 0;
+
+    pthread_cond_broadcast(&ctx->cond);
+
+    /* the workers broadcast on the same condition after each finished job */
+    for (;;) {
+        if (ctx->nb_jobs_finished >= ctx->nb_jobs)
+            break;
+        pthread_cond_wait(&ctx->cond, &ctx->mutex);
+    }
+
+    ctx->func     = NULL;
+    ctx->func_arg = NULL;
+
+    pthread_mutex_unlock(&ctx->mutex);
+}
diff --git a/src/threadpool.h b/src/threadpool.h
new file mode 100644
index 0000000..0f6896d
--- /dev/null
+++ b/src/threadpool.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MD_THREADPOOL_H
+#define MD_THREADPOOL_H
+
+typedef struct ThreadPoolContext ThreadPoolContext; /* opaque handle; the struct is not defined in this header */
+
+int md_threadpool_init(ThreadPoolContext **ctx, unsigned int nb_threads); /* create a pool with nb_threads workers; returns 0 or a negative errno-style code */
+void md_threadpool_free(ThreadPoolContext **ctx); /* stop and join the workers, free the pool and set *ctx to NULL; no-op when *ctx is NULL */
+
+void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs, /* run nb_jobs jobs through func and block until all are finished */
+ void (*func)(void *arg,
+ unsigned int job_idx, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads),
+ void *arg);
+
+#endif /* MD_THREADPOOL_H */
diff --git a/src/x86inc.asm b/src/x86inc.asm
new file mode 100644
index 0000000..dca1f78
--- /dev/null
+++ b/src/x86inc.asm
@@ -0,0 +1,1544 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2016 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Fiona Glaser <fiona@x264.com>
+;* Henrik Gramner <henrik@gramner.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+%ifndef private_prefix ; default symbol prefix when the including file did not choose one
+ %define private_prefix x264
+%endif
+
+%ifndef public_prefix ; public symbols reuse the private prefix unless overridden
+ %define public_prefix private_prefix
+%endif
+
+%if HAVE_ALIGNED_STACK ; NOTE(review): presumably supplied by the build configuration (config.asm) - confirm
+ %define STACK_ALIGNMENT 16
+%endif
+%ifndef STACK_ALIGNMENT ; otherwise default to 16 bytes on x86-64 and 4 bytes on x86-32
+ %if ARCH_X86_64
+ %define STACK_ALIGNMENT 16
+ %else
+ %define STACK_ALIGNMENT 4
+ %endif
+%endif
+
+%define WIN64 0 ; on x86-64 exactly one of WIN64/UNIX64 is set to 1 below; both stay 0 on x86-32
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,x64
+ %define WIN64 1
+ %else ; every other 64-bit output format selects the UNIX64 code paths
+ %define UNIX64 1
+ %endif
+%endif
+
+%define FORMAT_ELF 0 ; set to 1 below for the elf, elf32 and elf64 output formats
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define FORMAT_ELF 1
+%endif
+
+%ifdef PREFIX ; with PREFIX defined, mangle() prepends an underscore to the symbol name
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; aout does not support align=
+; NOTE: This section is out of sync with x264, in order to
+; keep supporting OS/2.
+%macro SECTION_RODATA 0-1 16 ; optional argument: section alignment, 16 by default
+ %ifidn __OUTPUT_FORMAT__,aout ; aout: constants are placed in .text instead
+ section .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+%if WIN64 ; WIN64 builds always define PIC
+ %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+ %undef PIC
+%endif
+%ifdef PIC ; PIC builds switch the default addressing mode to RIP-relative
+ default rel
+%endif
+
+%macro CPUNOP 1 ; emits a CPU directive with the given argument, only when HAVE_CPUNOP is non-zero
+ %if HAVE_CPUNOP
+ CPU %1
+ %endif
+%endmacro
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+; allocating the specified stack size. If the required stack alignment is
+; larger than the known stack alignment the stack will be manually aligned
+; and an extra register will be allocated to hold the original stack
+; pointer (to not invalidate r0m etc.). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3 ; argument index, register name, [stack offset of the argument]
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %define %2q %2
+ %if %0 == 2 ; no stack offset given: the original location of the argument is the register itself
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
+ %else ; x86-32 memory argument: same address, dword sized
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3 ; 16-bit register name, low-byte name, high-byte name
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+ %if ARCH_X86_64 == 0 ; on x86-32 the r-prefixed name is an alias of the 32-bit e-register
+ %define r%1 e%1
+ %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null ; si, di and bp are declared with "null" as their high-byte name
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-* ; list of register indices: t0, t1, ... become aliases of the listed rN, in order
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-* ; list of indices: defines the q/d/w/h/b sized names of each tN
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64 ; gprsize: size in bytes of a native general-purpose register
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1 ; push that also keeps stack_offset up to date
+ push %1
+ %ifidn rstk, rsp ; the offset is tracked only while rstk still names rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
+%endmacro
+
+%macro POP 1 ; pop that also keeps stack_offset up to date
+ pop %1
+ %ifidn rstk, rsp ; the offset is tracked only while rstk still names rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-* ; list of register indices: PUSH every rN whose index is below regs_used
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-* ; list of register indices: pop every rN whose index is below regs_used
+ %rep %0
+ %if %1 < regs_used
+ pop r%1 ; plain pop instruction rather than the POP macro, so stack_offset is not adjusted
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-* ; list of argument indices: load each one below num_args from its original location into rN
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2 ; sub that grows stack_offset when the destination is rstk
+ sub %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2 ; add that shrinks stack_offset when the destination is rstk
+ add %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2 ; mov that is omitted when both operands are textually identical
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2 ; movsxd that is omitted when both operands are textually identical
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1 ; assembly-time assertion: raises an error when the expression evaluates to 0
+ %if (%1) == 0
+ %error assertion ``%1'' failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-* ; list of names: binds them in order to r0, r1, ... including the q/d/w/h/b/m/mp variants
+ %ifdef n_arg_names ; first remove the names bound by the previous invocation
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0 ; remembered so the next invocation knows how many names to remove
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15) ; mmsize rounded up to a multiple of 16
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1 ; nothing is allocated unless the first argument is a non-zero number
+ %if %1 != 0
+ %assign %%pad 0
+ %assign stack_size %1
+ %if stack_size < 0 ; the sign only selects where the original rsp is kept; the size is the magnitude
+ %assign stack_size -stack_size
+ %endif
+ %if WIN64
+ %assign %%pad %%pad + 32 ; shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+ %endif
+ %endif
+ %endif
+ %if required_stack_alignment <= STACK_ALIGNMENT
+ ; maintain the current stack alignment
+ %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1) ; the highest-numbered used register holds the original rsp
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ %if %1 < 0 ; need to store rsp on stack
+ %xdefine rstkm [rsp + stack_size + %%pad]
+ %assign %%pad %%pad + gprsize
+ %else ; can keep rsp in rstk during whole function
+ %xdefine rstkm rstk
+ %endif
+ %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+ mov rstk, rsp
+ and rsp, ~(required_stack_alignment-1)
+ sub rsp, stack_size_padded
+ movifnidn rstkm, rstk
+ %endif
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1 ; stack_size: adjusts regs_used when the stack will have to be aligned manually
+ %ifnum %1
+ %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+ %if %1 > 0 ; positive size: one more register is reserved to hold the original rsp
+ %assign regs_used (regs_used + 1)
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+ ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
+ %assign regs_used 5 + UNIX64 * 3
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+ ; PROLOGUE argument count, 4th PROLOGUE argument, remaining PROLOGUE arguments
+ %ifnum %2 ; the 4th argument is a number (the stack size): the names are the remaining arguments
+ DEFINE_ARGS %3
+ %elif %1 == 4 ; exactly four arguments: the 4th is the only name
+ DEFINE_ARGS %2
+ %elif %1 > 4 ; the 4th argument is already a name and more names follow it
+ DEFINE_ARGS %2, %3
+ %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx ; arguments 0-3 are declared without a stack offset, i.e. they arrive in registers
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40 ; arguments 4 and up are declared with the stack offset they are loaded from
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0 ; ALLOC_STACK allocated nothing: the xmm save area is set up here instead
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0 ; stores the used xmm registers from xmm6 upward
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %if xmm_regs_used > 8
+ %assign %%i 8
+ %rep xmm_regs_used-8
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1 ; number of xmm registers used by the function
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 8
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1 ; base register used to address the save area (RET passes rsp)
+ %assign %%pad_size 0
+ %if xmm_regs_used > 8 ; reload xmm8 and above, highest register first
+ %assign %%i xmm_regs_used
+ %rep xmm_regs_used-8
+ %assign %%i %%i-1
+ movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+ %endrep
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: reload the saved original rsp
+ mov rsp, rstkm
+ %else
+ add %1, stack_size_padded
+ %assign %%pad_size stack_size_padded
+ %endif
+ %endif
+ %if xmm_regs_used > 7
+ movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6
+ movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1 ; base register: restores the xmm registers, then resets the bookkeeping
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset (stack_offset-stack_size_padded)
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; restores xmm registers and stack, pops the pushed registers, then returns
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+ %if mmsize == 32 ; vzeroupper is emitted whenever the 32-byte register set is selected
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi ; arguments 0-5 are declared without a stack offset, i.e. they arrive in registers
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8 ; arguments 6 and up are declared with the stack offset they are loaded from
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4 ; the xmm register count is not passed on here
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; releases the stack allocation, pops the pushed registers, then returns
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: reload the saved original rsp
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+ %if mmsize == 32 ; vzeroupper is emitted whenever the 32-byte register set is selected
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4 ; every argument is declared with a stack offset on x86-32
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp ; lets the shared macros keep referring to rsp
+
+%macro DECLARE_ARG 1-* ; list of argument indices that get a stack location (rNm, rNmp) but no register
+ %rep %0
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7 ; only r0-r6 are declared as registers on x86-32, so both counts are clamped to 7
+ %assign num_args 7
+ %endif
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+
+%macro RET 0 ; releases the stack allocation, pops the pushed registers, then returns
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT ; manually aligned stack: reload the saved original stack pointer
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 6, 5, 4, 3
+ %if mmsize == 32 ; vzeroupper is emitted whenever the 32-byte register set is selected
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0 ; other targets get empty versions of the WIN64 xmm macros
+ %macro WIN64_SPILL_XMM 1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 1
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0 ; full RET when an epilogue is needed, otherwise the two-byte rep ret
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+ annotate_function_size
+%endmacro
+
+%define last_branch_adr $$ ; initial value; redefined by the BRANCH_INSTR wrappers after each branch
+%macro AUTO_REP_RET 0 ; ret, prefixed with rep when it directly follows a recorded branch
+ %if notcpuflag(ssse3) ; the prefix is only considered when ssse3 is not among the cpuflags
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+ %endif
+ ret
+ annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-* ; list of jump mnemonics: wraps each one in a macro that records the address following it
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %if notcpuflag(ssse3) ; recorded only when ssse3 is not among the cpuflags, matching AUTO_REP_RET
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue ; the epilogue still has to run afterwards, so use a real call followed by RET
+ call %1
+ RET
+ %elif %2 ; no epilogue: jump to a non-adjacent callee; nothing is emitted for an adjacent one
+ jmp %1
+ %endif
+ annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]; declared with private_prefix and, on ELF, hidden visibility
+ cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]; declared with public_prefix and no visibility attribute
+ cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+ ; is_private flag, function name with SUFFIX appended, [PROLOGUE args]
+ annotate_function_size
+ %if %1
+ %xdefine %%FUNCTION_PREFIX private_prefix
+ %xdefine %%VISIBILITY hidden
+ %else
+ %xdefine %%FUNCTION_PREFIX public_prefix
+ %xdefine %%VISIBILITY
+ %endif
+ %ifndef cglobaled_%2 ; first declaration of this name: define its mangled form and mark it as known
+ %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+ %xdefine %2.skip_prologue %2 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %2, 1
+ %endif
+ %xdefine current_function %2
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
+ global %2:function %%VISIBILITY
+ %else
+ global %2
+ %endif
+ align function_align
+ %2:
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %ifnidn %3, "" ; PROLOGUE is run only when prologue arguments were supplied
+ PROLOGUE %3
+ %endif
+%endmacro
+
+%macro cextern 1 ; name: declares an external symbol carrying private_prefix and marks it as known to call
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1 ; name: external symbol without the project prefix
+ %ifdef PREFIX ; the name is redefined only when mangle() actually changes it
+ %xdefine %1 mangle(%1)
+ %endif
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 1-2+ ; name, [data definition placed after the label]
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ %if FORMAT_ELF ; on ELF the symbol is exported as hidden data
+ global %1:data hidden
+ %else
+ global %1
+ %endif
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF ; the note section is emitted for ELF output only
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0 ; emits a size directive for current_function; expands to nothing unless all three conditions hold
+ %ifdef __YASM_VER__ ; only when assembling with yasm
+ %ifdef current_function ; only once a function has been declared
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx (1<<0) ; each flag owns one bit and ORs in the flags it is built on
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
+%assign cpuflags_avx (1<<11)| cpuflags_sse42
+%assign cpuflags_xop (1<<12)| cpuflags_avx
+%assign cpuflags_fma4 (1<<13)| cpuflags_avx
+%assign cpuflags_fma3 (1<<14)| cpuflags_avx
+%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
+
+%assign cpuflags_cache32 (1<<16) ; the flags from here on do not include any of the instruction-set bits above
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<21)
+%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+; The result is 1 only when every bit of cpuflags_x is set in cpuflags: the xor is then 0 and 0-1 has bit 31 set.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-* ; list of cpuflag names; an empty list clears SUFFIX, cpuname and cpuflags
+ %xdefine SUFFIX
+ %undef cpuname
+ %assign cpuflags 0
+
+ %if %0 >= 1
+ %rep %0 ; cpuname becomes the flag names joined by underscores, cpuflags the OR of their masks
+ %ifdef cpuname
+ %xdefine cpuname cpuname %+ _%1
+ %else
+ %xdefine cpuname %1
+ %endif
+ %assign cpuflags cpuflags | cpuflags_%1
+ %rotate 1
+ %endrep
+ %xdefine SUFFIX _ %+ cpuname
+
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) ; use the ps forms of the move aliases in these cases
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned) ; aligned function variants map movu to mova
+ %define movu mova
+ %elif cpuflag(sse3) && notcpuflag(ssse3) ; sse3 without ssse3: movu becomes lddqu
+ %define movu lddqu
+ %endif
+ %endif
+
+ %if ARCH_X86_64 || cpuflag(sse2)
+ CPUNOP amdnop
+ %else
+ CPUNOP basicnop
+ %endif
+%endmacro
+
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3 ; defines the name formed by joining the first two arguments as the third argument
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2 ; undefines the name formed by joining the two arguments
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+ ; [cpuflags]: selects the 8-byte mm registers as m0-m7
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nnmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8 ; the counter continues at 8, so this removes the m8-m15 and nnmm8-nnmm15 definitions
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nnmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+ ; [cpuflags]: selects the 16-byte xmm registers as m0..m(num_mmregs-1)
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64 ; 16 registers are mapped on x86-64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nnxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+ ; [cpuflags]: selects the 32-byte ymm registers as m0..m(num_mmregs-1)
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64 ; 16 registers are mapped on x86-64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nnymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+%macro DECLARE_MMCAST 1 ; register index: builds the lookup names that xmN and ymN resolve through
+ %define mmmm%1 mm%1
+ %define mmxmm%1 mm%1
+ %define mmymm%1 mm%1
+ %define xmmmm%1 mm%1
+ %define xmmxmm%1 xmm%1
+ %define xmmymm%1 xmm%1
+ %define ymmmm%1 mm%1
+ %define ymmxmm%1 xmm%1
+ %define ymmymm%1 ymm%1
+ %define xm%1 xmm %+ m%1
+ %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16 ; declared for indices 0-15
+ DECLARE_MMCAST i
+ %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+ %rep %0/2 ; pass 1: snapshot the current mapping of the second register of each pair
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2 ; pass 2: give the first register of each pair the snapshot taken from the second
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-* ; list of register numbers: exchanges each adjacent pair in turn along the chain
+ %rep %0-1
+ %xdefine %%tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 %%tmp
+ CAT_XDEFINE nn, m%1, %1
+ CAT_XDEFINE nn, m%2, %2
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-* ; list of register names: maps each through its nn-prefixed name, then swaps by number
+ %xdefine %%args nn %+ %1
+ %rep %0-1
+ %xdefine %%args %%args, nn %+ %2
+ %rotate 1
+ %endrep
+ SWAP_INTERNAL_NUM %%args
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1 ; [function name]: defaults to current_function
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs ; records the current mapping of every mN under the function name
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0 ; does nothing unless a permutation was saved under this name
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE nn, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1 ; callee: a function name, or any other call operand
+    %ifid %1
+        ; Identifier: let call_internal choose between the plain name and the
+        ; name with SUFFIX appended, and reload a saved register permutation.
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        ; Anything else (e.g. a memory operand): appending SUFFIX would build a
+        ; malformed token, so emit the call instruction unchanged.
+        call %1
+    %endif
+%endmacro
+%macro call_internal 2 ; callee name with SUFFIX appended, plain callee name
+ %xdefine %%i %2
+ %ifndef cglobaled_%2 ; the suffixed name is used only when it is known and the plain name is not
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2 ; add, with an immediate of 128 rewritten as a subtraction of -128
+ %ifnum %2
+ %if %2==128 ; -128 fits in a sign-extended 8-bit immediate, +128 does not
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2 ; sub, with an immediate of 128 rewritten as an addition of -128
+ %ifnum %2
+ %if %2==128 ; -128 fits in a sign-extended 8-bit immediate, +128 does not
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0 ; builds sizeofmmN, sizeofxmmN and sizeofymmN: the size in bytes of each register name
+%rep 16
+ %if i < 8 ; the mm entries are defined for indices 0-7 only
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+ %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-* ; instruction text for the message, destination, operands that must differ from the destination
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+ ; emits the instruction in VEX form, in legacy form, or as a mov plus the legacy form
+ %ifnum sizeof%7 ; register size: taken from the second operand, else the first, else mmsize
+ %assign __sizeofreg sizeof%7
+ %elifnum sizeof%6
+ %assign __sizeofreg sizeof%6
+ %else
+ %assign __sizeofreg mmsize
+ %endif
+ %assign __emulate_avx 0
+ %if avx_enabled && __sizeofreg >= 16 ; VEX form: the mnemonic gets a v prefix
+ %xdefine __instr v%1
+ %else
+ %xdefine __instr %1
+ %if %0 >= 8+%4 ; enough operands were given for the emulation kind of this instruction
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %ifnidn %2, fnord ; fnord means no minimal instruction set was declared, so nothing is checked
+ %ifdef cpuname
+ %if notcpuflag(%2)
+ %error use of ``%1'' %2 instruction in cpuname function: current_function
+ %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+ %error use of ``%1'' sse2 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %endif
+
+ %if __emulate_avx
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %if %5 && %4 == 0 ; commutative 3-operand form: the sources may be exchanged
+ %ifnidn %6, %7
+ %ifidn %6, %8
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %elifnnum sizeof%8
+ ; 3-operand AVX instructions with a memory arg can only have it in src2,
+ ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+ ; So, if the instruction is commutative with a memory arg, swap them.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %endif
+ %ifnidn %6, __src1 ; the destination differs from the first source: copy the source into it first
+ %if %0 >= 9
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+ %endif
+ %if __sizeofreg == 8
+ MOVQ %6, __src1
+ %elif %3
+ MOVAPS %6, __src1
+ %else
+ MOVDQA %6, __src1
+ %endif
+ %endif
+ %if %0 >= 9
+ %1 %6, __src2, %9
+ %else
+ %1 %6, __src2
+ %endif
+ %elif %0 >= 9 ; no emulation: pass the operands through unchanged
+ __instr %6, %7, %8, %9
+ %elif %0 == 8
+ __instr %6, %7, %8
+ %elif %0 == 7
+ __instr %6, %7
+ %else
+ __instr %6
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0 ; defines a macro named after the instruction that forwards to RUN_AVX_INSTR
+ %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+ %ifidn %2, fnord ; one operand was given; the unused operand slots hold fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+ %elifidn %3, fnord ; two operands
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+ %elifidn %4, fnord ; three operands
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+ %elifidn %5, fnord ; four operands
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
+; Columns: name, minimal instruction set, 1 = float / 0 = int,
+; 1 = 4-operand emulation / 0 = 3-operand emulation, 1 = commutative.
+; Entries that omit the trailing columns take the AVX_INSTR defaults
+; (fnord, 0, 255, 0), i.e. no instruction-set check and/or no emulation.
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+; AVX_INSTR columns: name, minimal instruction set, float flag,
+; emulation kind (1 = 4-operand, 0 = 3-operand), commutative flag.
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+; AVX_INSTR columns: name, minimal instruction set, float flag,
+; emulation kind (1 = 4-operand, 0 = 3-operand), commutative flag.
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3, 0, 1, 0
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
+; AVX_INSTR columns: name, minimal instruction set, float flag,
+; emulation kind (1 = 4-operand, 0 = 3-operand), commutative flag.
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+; AVX_INSTR columns: name, minimal instruction set, float flag,
+; emulation kind (1 = 4-operand, 0 = 3-operand), commutative flag.
+AVX_INSTR rcpps, sse
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2
+AVX_INSTR sqrtps, sse
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+; Registered as float, 3-operand-emulated instructions; pfadd and pfmul are
+; flagged commutative, pfsub is not.
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; base-4 constants for shuffles
+; For every i in 0..255 a symbol qABCD is defined with value i, where A, B, C
+; and D are the four base-4 digits of i, most significant first (so q1032 is
+; (1<<6)|(0<<4)|(3<<2)|2). CAT_XDEFINE is defined outside this chunk.
+%assign i 0
+%rep 256
+ ; j is the decimal number whose digits are the base-4 digits of i.
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ ; Pad with leading zeros so that the name always has four digits.
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+ %assign i i+1
+%endrep
+%undef i
+%undef j
+
+; FMA_INSTR name, mul_instr, add_instr
+; Defines a 4-operand macro <name> dst, src1, src2, src3. With XOP it emits
+; v<name> directly; otherwise it emits "mul_instr dst, src1, src2" followed by
+; "add_instr dst, src3".
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ ; Inside the generated macro, parameters 1-4 are the operands and 5-7
+ ; default to name, mul_instr and add_instr.
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %elifnidn %1, %4
+ %6 %1, %2, %3
+ %7 %1, %4
+ %else
+ ; dst == src3: the multiply would overwrite src3 before it is added.
+ %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+; XOP multiply-accumulate instructions and the multiply/add pair used to
+; emulate each of them when XOP is not available.
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+; Being a plain %define, every use of the token tzcnt expands to "rep bsf".
+%define tzcnt rep bsf
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+;
+; FMA4_INSTR prefix, suffix [, suffix ...]
+; Defines one macro <prefix><suffix> per suffix given.
+%macro FMA4_INSTR 2-*
+ ; A preprocessor context keeps the prefix available as %$prefix while
+ ; %rotate cycles through the suffixes.
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ ; Generated macro: parameters 1-4 are dst, src1, src2, src3; 5 and 6
+ ; default to the prefix and the suffix.
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ ; FMA3: choose the 213/132/231 form according to which source dst aliases.
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifnum sizeof%3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
+%endmacro
+
+; Each line defines <prefix><suffix> for every suffix listed, e.g. fmaddpd,
+; fmaddps, fmaddsd and fmaddss for the first one.
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+; Only active for yasm older than 1.3.0 on 32-bit builds: vpbroadcastq is then
+; replaced by movddup for 16-byte destinations and by vbroadcastsd otherwise.
+%ifdef __YASM_VER__
+ %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+ %macro vpbroadcastq 2
+ %if sizeof%1 == 16
+ movddup %1, %2
+ %else
+ vbroadcastsd %1, %2
+ %endif
+ %endmacro
+ %endif
+%endif
diff --git a/src/x86util.asm b/src/x86util.asm
new file mode 100644
index 0000000..66280b2
--- /dev/null
+++ b/src/x86util.asm
@@ -0,0 +1,695 @@
+;*****************************************************************************
+;* x86util.asm
+;*****************************************************************************
+;* Copyright (C) 2008-2010 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Holger Lubitz <holger@lubitz.org>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; Symbol prefixes, defined before x86inc.asm is included.
+; NOTE(review): presumably consumed by x86inc.asm when it mangles function
+; names -- confirm there.
+%define private_prefix qms
+%define public_prefix qms
+; Makes cpuflag(mmxext), as used by the macros in this file, an alias of the
+; mmx2 flag.
+%define cpuflags_mmxext cpuflags_mmx2
+
+%include "config.asm"
+
+%include "x86inc.asm"
+
+; SBUTTERFLY suffix, a, b, tmp
+; Interleaves registers m<a> and m<b> with punpckl<suffix>/punpckh<suffix>:
+; the low interleave ends up in m<a>, the high interleave in m<tmp>, and the
+; final SWAP exchanges the register numbers b and tmp.
+%macro SBUTTERFLY 4
+%if avx_enabled
+ ; 3-operand form: the high interleave is written straight into tmp.
+ punpckh%1 m%4, m%2, m%3
+ punpckl%1 m%2, m%3
+%else
+ ; 2-operand form: keep a copy of a, because punpckl overwrites it.
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+%endif
+ SWAP %3, %4
+%endmacro
+
+; SBUTTERFLY2 suffix, a, b, tmp
+; Interleave of m<a> and m<b> written with 3-operand syntax only: the low
+; interleave goes to m<tmp>, the high interleave to m<a>, then the register
+; numbers a, tmp and b are permuted with SWAP.
+%macro SBUTTERFLY2 4
+ punpckl%1 m%4, m%2, m%3
+ punpckh%1 m%2, m%2, m%3
+ SWAP %2, %4, %3
+%endmacro
+
+; SBUTTERFLYPS a, b, tmp
+; Float interleave of m<a> and m<b>: unpcklps result in m<tmp>, unpckhps
+; result in m<a>, followed by a SWAP of the register numbers a, tmp and b.
+%macro SBUTTERFLYPS 3
+ unpcklps m%3, m%1, m%2
+ unpckhps m%1, m%1, m%2
+ SWAP %1, %3, %2
+%endmacro
+
+; TRANSPOSE4x4B r0, r1, r2, r3, tmp
+; Two SBUTTERFLY passes (byte->word, then word->dword) over the four
+; registers, using tmp as scratch, followed by a SWAP of r1 and r2.
+%macro TRANSPOSE4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+; TRANSPOSE4x4W r0, r1, r2, r3, tmp
+; Two SBUTTERFLY passes (word->dword, then dword->qword) over the four
+; registers, using tmp as scratch, followed by a SWAP of r1 and r2.
+%macro TRANSPOSE4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+; TRANSPOSE2x4x4W r0, r1, r2, r3, tmp
+; Same wd and dq passes as TRANSPOSE4x4W, but finished with a qdq pass on
+; (r0, r1) and (r2, r3) instead of the SWAP.
+; NOTE(review): per the name this transposes two 4x4 word blocks held in
+; 16-byte registers -- confirm against the callers.
+%macro TRANSPOSE2x4x4W 5
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SBUTTERFLY qdq, %1, %2, %5
+ SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+; TRANSPOSE4x4D r0, r1, r2, r3, tmp
+; Two SBUTTERFLY passes (dword->qword, then qword->dqword) over the four
+; registers, using tmp as scratch, followed by a SWAP of r1 and r2.
+%macro TRANSPOSE4x4D 5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+ SBUTTERFLY qdq, %1, %3, %5
+ SBUTTERFLY qdq, %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
+; TRANSPOSE4x4PS r0, r1, r2, r3, tmp
+; The 32-bit interleave is done by SBUTTERFLYPS; the 64-bit step uses
+; movlhps/movhlps, with SWAP fixing up the register numbering afterwards.
+%macro TRANSPOSE4x4PS 5
+ SBUTTERFLYPS %1, %2, %5
+ SBUTTERFLYPS %3, %4, %5
+ movlhps m%5, m%1, m%3
+ movhlps m%3, m%1
+ SWAP %5, %1
+ movlhps m%5, m%2, m%4
+ movhlps m%4, m%2
+ SWAP %5, %2, %3
+%endmacro
+
+; TRANSPOSE8x8W r0..r7, t0 [, t1 [, flag]]
+; Three SBUTTERFLY passes (wd, dq, qdq) over eight registers.
+; x86-64: t0 (%9) is the index of a scratch register.
+; x86-32: no scratch register is used; %9 and %10 are operands that movdqa
+; can store to and load from (spill slots), and the mere presence of an 11th
+; argument changes the in/out convention as described below.
+%macro TRANSPOSE8x8W 9-11
+%if ARCH_X86_64
+ SBUTTERFLY wd, %1, %2, %9
+ SBUTTERFLY wd, %3, %4, %9
+ SBUTTERFLY wd, %5, %6, %9
+ SBUTTERFLY wd, %7, %8, %9
+ SBUTTERFLY dq, %1, %3, %9
+ SBUTTERFLY dq, %2, %4, %9
+ SBUTTERFLY dq, %5, %7, %9
+ SBUTTERFLY dq, %6, %8, %9
+ SBUTTERFLY qdq, %1, %5, %9
+ SBUTTERFLY qdq, %2, %6, %9
+ SBUTTERFLY qdq, %3, %7, %9
+ SBUTTERFLY qdq, %4, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+ ; Free register %7 by spilling it, so it can serve as the first scratch.
+ movdqa %9, m%7
+%endif
+ SBUTTERFLY wd, %1, %2, %7
+ ; From here on one live register is always parked in a spill slot and the
+ ; register freed that way is used as the SBUTTERFLY scratch.
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa %10, m%2
+ movdqa m%3, %9
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ ; Reload the row that was left in the second spill slot.
+ movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
+; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
+; PABSW dst, src: absolute value of packed signed words.
+; Note that the plain-MMX fallback computes the result in src and then SWAPs
+; the two names, so src does not keep its original contents on that path.
+%macro PABSW 2
+%if cpuflag(ssse3)
+ pabsw %1, %2
+%elif cpuflag(mmxext)
+ ; dst = max(0 - src, src)
+ pxor %1, %1
+ psubw %1, %2
+ pmaxsw %1, %2
+%else
+ ; mask = (0 > src); result = (src ^ mask) - mask
+ pxor %1, %1
+ pcmpgtw %1, %2
+ pxor %2, %1
+ psubw %2, %1
+ SWAP %1, %2
+%endif
+%endmacro
+
+; PSIGNW_MMX dst, mask
+; Computes dst = (dst ^ mask) - mask per word, which negates the words of dst
+; wherever the corresponding mask word is all ones and leaves them unchanged
+; where it is zero.
+; NOTE(review): assumes mask words are 0 or -1; other values (and the
+; zeroing case of psignw) are not reproduced -- confirm with the callers.
+%macro PSIGNW_MMX 2
+ pxor %1, %2
+ psubw %1, %2
+%endmacro
+
+; PSIGNW_SSSE3 dst, src
+; Thin wrapper emitting the native psignw; it has the same argument list as
+; PSIGNW_MMX.
+%macro PSIGNW_SSSE3 2
+ psignw %1, %2
+%endmacro
+
+; ABS1 a, tmp
+; In-place absolute value of the packed signed words in a. tmp is clobbered
+; on the non-SSSE3 paths and unused with SSSE3.
+%macro ABS1 2
+%if cpuflag(ssse3)
+ pabsw %1, %1
+%elif cpuflag(mmxext) ; a, tmp
+ ; a = max(a, 0 - a)
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%else ; a, tmp
+ ; tmp = (0 > a); a = (a ^ tmp) - tmp
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
+%endif
+%endmacro
+
+; ABS2 a, b, tmp0, tmp1
+; In-place absolute value of the packed signed words in a and b, with the two
+; computations interleaved. tmp0/tmp1 are clobbered on the non-SSSE3 paths.
+%macro ABS2 4
+%if cpuflag(ssse3)
+ pabsw %1, %1
+ pabsw %2, %2
+%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
+ ; x = max(x, 0 - x)
+ pxor %3, %3
+ pxor %4, %4
+ psubw %3, %1
+ psubw %4, %2
+ pmaxsw %1, %3
+ pmaxsw %2, %4
+%else ; a, b, tmp0, tmp1
+ ; tmp = (0 > x); x = (x ^ tmp) - tmp
+ pxor %3, %3
+ pxor %4, %4
+ pcmpgtw %3, %1
+ pcmpgtw %4, %2
+ pxor %1, %3
+ pxor %2, %4
+ psubw %1, %3
+ psubw %2, %4
+%endif
+%endmacro
+
+; In-place absolute value of packed signed bytes.
+%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
+%if cpuflag(ssse3)
+ pabsb %1, %1
+%else
+ ; tmp = 0 - x; taking the unsigned minimum of x and -x yields |x|.
+ pxor %2, %2
+ psubb %2, %1
+ pminub %1, %2
+%endif
+%endmacro
+
+; In-place absolute value of packed signed bytes for two registers at once.
+%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
+%if cpuflag(ssse3)
+ pabsb %1, %1
+ pabsb %2, %2
+%else
+ ; tmp = 0 - x; x = unsigned min(x, tmp)
+ pxor %3, %3
+ pxor %4, %4
+ psubb %3, %1
+ psubb %4, %2
+ pminub %1, %3
+ pminub %2, %4
+%endif
+%endmacro
+
+; ABSD2_MMX a, b, tmp0, tmp1
+; In-place absolute value of the packed signed dwords in a and b using only
+; compare/xor/subtract: tmp = (0 > x); x = (x ^ tmp) - tmp.
+; tmp0 and tmp1 are clobbered.
+%macro ABSD2_MMX 4
+ pxor %3, %3
+ pxor %4, %4
+ pcmpgtd %3, %1
+ pcmpgtd %4, %2
+ pxor %1, %3
+ pxor %2, %4
+ psubd %1, %3
+ psubd %2, %4
+%endmacro
+
+; ABS4 a, b, c, d, tmp0, tmp1
+; Applies ABS2 to (a, b) and then to (c, d), sharing the two temporaries.
+%macro ABS4 6
+ ABS2 %1, %2, %5, %6
+ ABS2 %3, %4, %5, %6
+%endmacro
+
+; SPLATB_LOAD dst, addr, mask
+; Loads the dword at [addr-3] (so the byte at addr is the top byte of that
+; dword) and broadcasts a byte across dst.
+; SSSE3: the broadcast is a pshufb with the mask operand.
+; NOTE(review): the mask is presumably expected to select byte 3 -- confirm
+; with the callers.
+; Otherwise: bytes are doubled into words and word 3 is splatted; the mask
+; argument is unused on this path.
+%macro SPLATB_LOAD 3
+%if cpuflag(ssse3)
+ movd %1, [%2-3]
+ pshufb %1, %3
+%else
+ movd %1, [%2-3] ;to avoid crossing a cacheline
+ punpcklbw %1, %1
+ SPLATW %1, %1, 3
+%endif
+%endmacro
+
+; SPLATB_REG dst, gpr, mask
+; Moves the dword form of the general-purpose register (<gpr>d) into dst and
+; broadcasts a byte across dst.
+; SSSE3: the broadcast is a pshufb with the mask operand.
+; NOTE(review): the mask is presumably all zeros (select byte 0) -- confirm.
+; Otherwise: bytes are doubled into words and word 0 is splatted; the mask
+; argument is unused on this path.
+%macro SPLATB_REG 3
+%if cpuflag(ssse3)
+ movd %1, %2d
+ pshufb %1, %3
+%else
+ movd %1, %2d
+ punpcklbw %1, %1
+ SPLATW %1, %1, 0
+%endif
+%endmacro
+
+; PALIGNR [dst,] src1, src2, imm, tmp
+; palignr with a shift/or fallback for CPUs without SSSE3.
+; With 4 arguments src1 doubles as the destination; with 5 arguments the
+; first one is a separate destination. tmp is only used by the fallback,
+; which clobbers it. Nothing is emitted if neither ssse3 nor mmx is set.
+%macro PALIGNR 4-5
+%if cpuflag(ssse3)
+%if %0==5
+ palignr %1, %2, %3, %4
+%else
+ palignr %1, %2, %3
+%endif
+%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
+ %define %%dst %1
+%if %0==5
+%ifnidn %1, %2
+ mova %%dst, %2
+%endif
+ ; Drop the explicit dst so that both call forms see
+ ; %1 = src1, %2 = src2, %3 = imm, %4 = tmp from here on.
+ %rotate 1
+%endif
+%ifnidn %4, %2
+ mova %4, %2
+%endif
+ ; dst = (src1 << (size - imm)) | (src2 >> imm), shifts counted in bytes.
+%if mmsize==8
+ psllq %%dst, (8-%3)*8
+ psrlq %4, %3*8
+%else
+ pslldq %%dst, 16-%3
+ psrldq %4, %3
+%endif
+ por %%dst, %4
+%endif
+%endmacro
+
+; PAVGB dst, src
+; Packed unsigned byte average: pavgb when mmxext is available, otherwise the
+; 3DNow! pavgusb. Assembling fails with an explicit error when neither flag
+; is set, instead of silently emitting no instruction at all.
+%macro PAVGB 2
+%if cpuflag(mmxext)
+ pavgb %1, %2
+%elif cpuflag(3dnow)
+ pavgusb %1, %2
+%else
+ %error PAVGB requires mmxext or 3dnow
+%endif
+%endmacro
+
+; PSHUFLW operands...
+; Word shuffle that adapts to the register size: pshufw for 8-byte (MMX)
+; registers, pshuflw for anything wider. The whole operand list is passed
+; through unchanged.
+%macro PSHUFLW 1+
+ %if mmsize != 8
+ pshuflw %1
+ %else
+ pshufw %1
+ %endif
+%endmacro
+
+; PSWAPD dst, src
+; Swaps the two dwords of an 8-byte register.
+; mmxext: pshufw with the q1032 shuffle constant
+; 3dnowext: native pswapd
+; 3dnow: dst = src >> 32, then punpckldq puts the low dword of src on top
+; (this path needs dst != src, since src is read after dst is written)
+; Assembling fails with an explicit error when none of these flags is set,
+; instead of silently emitting no instruction at all.
+%macro PSWAPD 2
+%if cpuflag(mmxext)
+ pshufw %1, %2, q1032
+%elif cpuflag(3dnowext)
+ pswapd %1, %2
+%elif cpuflag(3dnow)
+ movq %1, %2
+ psrlq %1, 32
+ punpckldq %1, %2
+%else
+ %error PSWAPD requires mmxext, 3dnowext or 3dnow
+%endif
+%endmacro
+
+; Splits two registers of interleaved bytes into their masked part and their
+; upper bytes: m<%1> = m<%2> & mask and m<%3> = m<%4> & mask, after which
+; m<%2> and m<%4> are shifted right by 8 bits per word.
+; If %5 is a number it is the index of a register already holding the mask;
+; otherwise %5 is an operand that is first loaded into m<%1> and used as mask.
+; NOTE(review): the mask is presumably 0x00ff per word -- confirm.
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5
+ pand m%3, m%5, m%4 ; src .. y6 .. y4
+ pand m%1, m%5, m%2 ; dst .. y6 .. y4
+%else
+ mova m%1, %5
+ pand m%3, m%1, m%4 ; src .. y6 .. y4
+ pand m%1, m%1, m%2 ; dst .. y6 .. y4
+%endif
+ psrlw m%2, 8 ; dst .. y7 .. y5
+ psrlw m%4, 8 ; src .. y7 .. y5
+%endmacro
+
+; SUMSUB_BA suffix, a, b [, tmp]
+; Butterfly on registers m<a> and m<b> using padd<suffix>/psub<suffix>:
+; afterwards a holds a+b and b holds b-a.
+; Without tmp, b-a is obtained as 2*b - (a+b). With tmp, one of two
+; orderings is chosen depending on whether 3-operand (AVX) syntax is enabled.
+%macro SUMSUB_BA 3-4
+%if %0==3
+ padd%1 m%2, m%3
+ padd%1 m%3, m%3
+ psub%1 m%3, m%2
+%elif avx_enabled
+ ; The sum is built in tmp, which then takes over the name a.
+ padd%1 m%4, m%2, m%3
+ psub%1 m%3, m%2
+ SWAP %2, %4
+%else
+ ; Save a in tmp so it is still available for the subtraction.
+ mova m%4, m%2
+ padd%1 m%2, m%3
+ psub%1 m%3, m%4
+%endif
+%endmacro
+
+; SUMSUB_BADC suffix, a, b, c, d [, tmp]
+; Two butterflies at once: (a, b) -> (a+b, b-a) and (c, d) -> (c+d, d-c).
+; Without tmp the two in-place butterflies are interleaved instruction by
+; instruction; with tmp the work is delegated to SUMSUB_BA.
+%macro SUMSUB_BADC 5-6
+%if %0==5
+ padd%1 m%2, m%3
+ padd%1 m%4, m%5
+ padd%1 m%3, m%3
+ padd%1 m%5, m%5
+ psub%1 m%3, m%2
+ psub%1 m%5, m%4
+%else
+ SUMSUB_BA %1, %2, %3, %6
+ SUMSUB_BA %1, %4, %5, %6
+%endif
+%endmacro
+
+; SUMSUB2_AB suffix, a, b, out
+; Computes m<a> = 2*a + b and m<out> = a - 2*b.
+; b is either a register index (numeric) or any other operand, e.g. a
+; memory reference, which is then used as written.
+%macro SUMSUB2_AB 4
+%ifnum %3
+ ; out = a - b - b, then a = a + a + b
+ psub%1 m%4, m%2, m%3
+ psub%1 m%4, m%3
+ padd%1 m%2, m%2
+ padd%1 m%2, m%3
+%else
+ ; Same result; a is copied to out first because a is updated in place.
+ mova m%4, m%2
+ padd%1 m%2, m%2
+ padd%1 m%2, %3
+ psub%1 m%4, %3
+ psub%1 m%4, %3
+%endif
+%endmacro
+
+; SUMSUB2_BA suffix, a, b, tmp
+; Computes a = a + 2*b and b = b - 2*a (with the original a), using
+; padd<suffix>/psub<suffix>. tmp is a scratch register index.
+%macro SUMSUB2_BA 4
+%if avx_enabled
+ ; Build a + 2*b in tmp, leave a intact for the subtractions, then let tmp
+ ; take over the name a.
+ padd%1 m%4, m%2, m%3
+ padd%1 m%4, m%3
+ psub%1 m%3, m%2
+ psub%1 m%3, m%2
+ SWAP %2, %4
+%else
+ ; Save the original a in tmp before it is overwritten.
+ mova m%4, m%2
+ padd%1 m%2, m%3
+ padd%1 m%2, m%3
+ psub%1 m%3, m%4
+ psub%1 m%3, m%4
+%endif
+%endmacro
+
+; SUMSUBD2_AB suffix, a, b, t0, t1
+; Computes m<a> = (a >> 1) - b and m<b> = a + (b >> 1), using arithmetic
+; shifts (psra<suffix>).
+; If t0 is numeric, t0/t1 are scratch register indices and the results are
+; moved into place with SWAP; otherwise t0/t1 are operands (e.g. memory)
+; used to save the original b and a.
+%macro SUMSUBD2_AB 5
+%ifnum %4
+ psra%1 m%5, m%2, 1 ; t1 = a>>1
+ psra%1 m%4, m%3, 1 ; t0 = b>>1
+ padd%1 m%4, m%2 ; t0 = (b>>1) + a
+ psub%1 m%5, m%3 ; t1 = (a>>1) - b
+ SWAP %2, %5
+ SWAP %3, %4
+%else
+ mova %5, m%2
+ mova %4, m%3
+ psra%1 m%3, 1 ; b = b>>1
+ psra%1 m%2, 1 ; a = a>>1
+ padd%1 m%3, %5 ; b = (b>>1) + original a
+ psub%1 m%2, %4 ; a = (a>>1) - original b
+%endif
+%endmacro
+
+; DCT4_1D r0, r1, r2, r3, tmp
+; One 4-point pass over word data built from SUMSUB_BADC, SUMSUB_BA and
+; SUMSUB2_AB, finished by a SWAP that renumbers the registers.
+; tmp is either a scratch register index (numeric) or the address of a memory
+; slot into which m<r1> is spilled for the SUMSUB2_AB step.
+%macro DCT4_1D 5
+%ifnnum %5
+ ; Memory scratch: use the temporary-free forms of the butterflies.
+ SUMSUB_BADC w, %4, %1, %3, %2
+ SUMSUB_BA w, %3, %4
+ mova [%5], m%2
+ SUMSUB2_AB w, %1, [%5], %2
+ SWAP %1, %3, %4, %2
+%else
+ ; Register scratch.
+ SUMSUB_BADC w, %4, %1, %3, %2, %5
+ SUMSUB_BA w, %3, %4, %5
+ SUMSUB2_AB w, %1, %2, %5
+ SWAP %1, %3, %4, %5, %2
+%endif
+%endmacro
+
+; IDCT4_1D suffix, r0, r1, r2, r3, t0 [, t1]
+; One 4-point inverse pass on registers m<%2>..m<%5>; the comments inside
+; give the value held by each register after every step.
+; If t0 (%6) is numeric, %6 and %7 are scratch register indices. Otherwise %6
+; is the base address of a memory scratch area; its second slot is at +16 for
+; suffix w and at +32 for any other suffix.
+%macro IDCT4_1D 6-7
+%ifnum %6
+ SUMSUBD2_AB %1, %3, %5, %7, %6
+ ; %3: %3>>1-%5 %5: %3+%5>>1
+ SUMSUB_BA %1, %4, %2, %7
+ ; %4: %2+%4 %2: %2-%4
+ SUMSUB_BADC %1, %5, %4, %3, %2, %7
+ ; %5: %2+%4 + (%3+%5>>1)
+ ; %4: %2+%4 - (%3+%5>>1)
+ ; %3: %2-%4 + (%3>>1-%5)
+ ; %2: %2-%4 - (%3>>1-%5)
+%else
+%ifidn %1, w
+ SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
+%else
+ SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
+%endif
+ SUMSUB_BA %1, %4, %2
+ SUMSUB_BADC %1, %5, %4, %3, %2
+%endif
+ SWAP %2, %5, %4
+ ; %2: %2+%4 + (%3+%5>>1) row0
+ ; %3: %2-%4 + (%3>>1-%5) row1
+ ; %4: %2-%4 - (%3>>1-%5) row2
+ ; %5: %2+%4 - (%3+%5>>1) row3
+%endmacro
+
+
+; LOAD_DIFF dst, tmp, zero, src_a, src_b
+; Loads bytes from src_a and src_b with movh, widens them to words and leaves
+; the word difference a - b in dst; tmp is clobbered.
+; zero is either a register holding zeros, used to widen both inputs, or the
+; literal token none, in which case no zero register is needed: both inputs
+; get the bytes of b as their high halves, which cancel in the subtraction.
+%macro LOAD_DIFF 5
+%ifnidn %3, none
+ movh %1, %4
+ punpcklbw %1, %3
+ movh %2, %5
+ punpcklbw %2, %3
+ psubw %1, %2
+%else
+ movh %1, %4
+ movh %2, %5
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%endif
+%endmacro
+
+; STORE_DCT r0, r1, r2, r3, base, offset
+; Stores four registers to [base+offset]: the low 8 bytes of r0..r3 at byte
+; offsets 0, 8, 16 and 24, then their high 8 bytes at 32, 40, 48 and 56.
+%macro STORE_DCT 6
+ movq [%5+%6+ 0], m%1
+ movq [%5+%6+ 8], m%2
+ movq [%5+%6+16], m%3
+ movq [%5+%6+24], m%4
+ movhps [%5+%6+32], m%1
+ movhps [%5+%6+40], m%2
+ movhps [%5+%6+48], m%3
+ movhps [%5+%6+56], m%4
+%endmacro
+
+; Runs LOAD_DIFF for four consecutive rows of two buffers. %7 is handed to
+; LOAD_DIFF as its zero-register argument. The two pointers default to r0 and
+; r2 and the increment flag to 0. Rows are addressed through r1/r3 (stride)
+; and r4/r5.
+; NOTE(review): r4 and r5 presumably hold 3*r1 and 3*r3 -- confirm with the
+; callers.
+; A non-zero increment flag advances both pointers by four rows afterwards.
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
+ LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
+ LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]
+ LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+ LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5]
+%if %10
+ lea %8, [%8+4*r1]
+ lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+; DIFFx2 res0, res1, tmp, zero, src0, src1 [, unused]
+; For each of the two rows: shifts the word residual right by 6 (arithmetic),
+; adds the bytes loaded from srcN (widened to words with the zero register)
+; using signed saturation, and finally packs both rows into res1 as unsigned
+; saturated bytes (res1's row first, res0's second). res0 and tmp are
+; clobbered. The optional 7th argument is not referenced by the body.
+%macro DIFFx2 6-7
+ movh %3, %5
+ punpcklbw %3, %4
+ psraw %1, 6
+ paddsw %1, %3
+ movh %3, %6
+ punpcklbw %3, %4
+ psraw %2, 6
+ paddsw %2, %3
+ packuswb %2, %1
+%endmacro
+
+; STORE_DIFF res, tmp, zero, mem
+; Adds a word residual to the bytes at mem and stores the result back:
+; loads mem (movh), widens it with the zero register, adds res >> 6 with
+; signed saturation, packs to unsigned saturated bytes and writes it to mem.
+; res and tmp are clobbered.
+%macro STORE_DIFF 4
+ movh %2, %4
+ punpcklbw %2, %3
+ psraw %1, 6
+ paddsw %1, %2
+ packuswb %1, %1
+ movh %4, %1
+%endmacro
+
+; Two-row variant of STORE_DIFF with a caller-chosen shift: rows are read from
+; [source] and [source+stride], the residuals add1/add2 are shifted right by
+; shift (arithmetic) and added with paddw (wrapping, not saturating), and the
+; sums are packed to unsigned saturated bytes and written back to the same
+; two addresses. add1, add2, reg1 and reg2 are clobbered.
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+ movh %3, [%7]
+ movh %4, [%7+%8]
+ psraw %1, %6
+ psraw %2, %6
+ punpcklbw %3, %5
+ punpcklbw %4, %5
+ paddw %3, %1
+ paddw %4, %2
+ packuswb %3, %5
+ packuswb %4, %5
+ movh [%7], %3
+ movh [%7+%8], %4
+%endmacro
+
+; Unsigned byte minimum: dst = min(dst, src).
+; With mmxext the third argument is ignored; without it, it is a scratch
+; register that gets clobbered.
+%macro PMINUB 3 ; dst, src, ignored
+%if cpuflag(mmxext)
+ pminub %1, %2
+%else ; dst, src, tmp
+ ; tmp = saturating (dst - src), i.e. the excess of dst over src or 0;
+ ; removing that excess from dst leaves the minimum.
+ mova %3, %1
+ psubusb %3, %2
+ psubb %1, %3
+%endif
+%endmacro
+
+; SPLATW dst, src [, index = 0]
+; Broadcasts word number <index> (0-3) of src to every word of dst.
+%macro SPLATW 2-3 0
+%if mmsize == 16
+ ; index*0x55 repeats the 2-bit index in all four selector fields; the low
+ ; qword is then duplicated into the high one.
+ pshuflw %1, %2, (%3)*0x55
+ punpcklqdq %1, %1
+%elif cpuflag(mmxext)
+ pshufw %1, %2, (%3)*0x55
+%else
+ ; Plain MMX: two self-interleave steps, each selecting the high or the
+ ; low half according to one bit of the index.
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ %if %3 & 2
+ punpckhwd %1, %1
+ %else
+ punpcklwd %1, %1
+ %endif
+ %if %3 & 1
+ punpckhwd %1, %1
+ %else
+ punpcklwd %1, %1
+ %endif
+%endif
+%endmacro
+
+; SPLATD reg
+; Broadcasts the low dword of reg to every dword of reg, in place.
+; 8-byte registers: punpckldq
+; sse2: pshufd with selector 0
+; sse: shufps with selector 0
+; Assembling fails with an explicit error for wider registers when neither
+; sse2 nor sse is set, instead of silently emitting no instruction at all.
+%macro SPLATD 1
+%if mmsize == 8
+ punpckldq %1, %1
+%elif cpuflag(sse2)
+ pshufd %1, %1, 0
+%elif cpuflag(sse)
+ shufps %1, %1, 0
+%else
+ %error SPLATD requires sse or sse2 for non-mmx registers
+%endif
+%endmacro
+
+; Clamps packed signed words: dst = min(max(dst, min), max).
+%macro CLIPW 3 ;(dst, min, max)
+ pmaxsw %1, %2
+ pminsw %1, %3
+%endmacro
+
+; Signed dword minimum without pminsd: dst = min(dst, src); tmp is clobbered.
+%macro PMINSD_MMX 3 ; dst, src, tmp
+ ; tmp = mask of lanes where src > dst
+ mova %3, %2
+ pcmpgtd %3, %1
+ ; ((dst ^ src) & mask) ^ src gives dst where the mask is set, src elsewhere
+ pxor %1, %2
+ pand %1, %3
+ pxor %1, %2
+%endmacro
+
+; Signed dword maximum without pmaxsd: dst = max(dst, src); tmp is clobbered.
+%macro PMAXSD_MMX 3 ; dst, src, tmp
+ ; tmp = mask of lanes where dst > src
+ mova %3, %1
+ pcmpgtd %3, %2
+ ; keep dst where the mask is set, take src elsewhere
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+%endmacro
+
+; Clamps packed signed dwords to [min, max] using the emulated min/max above.
+; The parameter count is declared as 3-4, matching the other CLIPD_*
+; variants, but this variant passes %4 on as the scratch register of both
+; helpers, so callers have to supply it.
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
+ PMINSD_MMX %1, %3, %4
+ PMAXSD_MMX %1, %2, %4
+%endmacro
+
+; Clamps packed signed dwords by going through single-precision floats:
+; convert to float, apply minps/maxps against the bounds (which must already
+; be floats, as noted in the parameter list), convert back to integers.
+%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+ cvtdq2ps %1, %1
+ minps %1, %3
+ maxps %1, %2
+ cvtps2dq %1, %1
+%endmacro
+
+; Clamps packed signed dwords to [min, max] with the native pminsd/pmaxsd.
+%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
+ pminsd %1, %3
+ pmaxsd %1, %2
+%endmacro
+
+; Broadcasts one 32-bit float from memory to all lanes of dst: native
+; vbroadcastss with AVX, otherwise a scalar load followed by shufps with
+; selector 0.
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
+%if cpuflag(avx)
+ vbroadcastss %1, %2
+%else ; sse
+ movss %1, %2
+ shufps %1, %1, 0
+%endif
+%endmacro
+
+; Broadcasts one 64-bit double from memory to all lanes of dst.
+; vbroadcastsd is used only for 32-byte registers with AVX; 16-byte registers
+; use movddup (SSE3 and later) or, on plain SSE2, a scalar load followed by
+; movlhps to copy the low half into the high half.
+%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
+%if cpuflag(avx) && mmsize == 32
+ vbroadcastsd %1, %2
+%elif cpuflag(sse3)
+ movddup %1, %2
+%else ; sse2
+ movsd %1, %2
+ movlhps %1, %1
+%endif
+%endmacro
+
+; SHUFFLE_MASK_W w0, w1, ..., w7
+; Emits 16 bytes of data (db), two per argument: a word index i below 0x80
+; becomes the byte pair 2*i, 2*i+1, while a value of 0x80 or above is emitted
+; twice unchanged.
+; NOTE(review): intended as a pshufb control mask that shuffles whole words,
+; with >= 0x80 zeroing the lane -- inferred from the name, confirm with the
+; callers.
+%macro SHUFFLE_MASK_W 8
+ %rep 8
+ %if %1>=0x80
+ db %1, %1
+ %else
+ db %1*2
+ db %1*2+1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+; Sign-extends the low packed words of src to dwords in dst.
+%macro PMOVSXWD 2; dst, src
+%if cpuflag(sse4)
+ pmovsxwd %1, %2
+%else
+ %ifnidn %1, %2
+ mova %1, %2
+ %endif
+ ; Duplicate every low word into both halves of a dword, then shift right
+ ; arithmetically by 16 so the upper half becomes the sign extension.
+ punpcklwd %1, %1
+ psrad %1, 16
+%endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddps
+; FMULADD_PS dst, src1, src2, src3, tmp
+; Computes dst = src1 * src2 + src3 on packed single-precision floats.
+; With FMA3/FMA4 this is a single fmaddps; otherwise a separate mulps and
+; addps are emitted. tmp is written only on the non-FMA path when dst is the
+; same register as src3.
+%macro FMULADD_PS 5
+ %if cpuflag(fma3) || cpuflag(fma4)
+ fmaddps %1, %2, %3, %4
+ %elifidn %1, %4
+ ; dst aliases the addend: build the product in tmp so it is not lost.
+ mulps %5, %2, %3
+ addps %1, %4, %5
+ %else
+ mulps %1, %2, %3
+ addps %1, %4
+ %endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddpd
+; FMULADD_PD dst, src1, src2, src3, tmp
+; Computes dst = src1 * src2 + src3 on packed double-precision floats.
+; With FMA3/FMA4 this is a single fmaddpd; otherwise a separate mulpd and
+; addpd are emitted. tmp is written only on the non-FMA path when dst is the
+; same register as src3.
+%macro FMULADD_PD 5
+ %if cpuflag(fma3) || cpuflag(fma4)
+ fmaddpd %1, %2, %3, %4
+ %elifidn %1, %4
+ ; dst aliases the addend: build the product in tmp so it is not lost.
+ mulpd %5, %2, %3
+ addpd %1, %4, %5
+ %else
+ mulpd %1, %2, %3
+ addpd %1, %4
+ %endif
+%endmacro