diff options
author | Anton Khirnov <anton@khirnov.net> | 2018-04-07 18:13:49 +0200 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2018-04-07 18:13:49 +0200 |
commit | d53f73b7f7c728e96ffc07b55fa30b9e6fc5121c (patch) | |
tree | c79d588e646ea3e65b2db1532d65bb691ee88b0e |
Initial commit.
-rw-r--r-- | configuration.ccl | 2 | ||||
-rw-r--r-- | interface.ccl | 16 | ||||
-rw-r--r-- | param.ccl | 37 | ||||
-rw-r--r-- | schedule.ccl | 30 | ||||
-rw-r--r-- | src/MinimalDistortion.m | 1490 | ||||
-rw-r--r-- | src/basis.c | 281 | ||||
-rw-r--r-- | src/basis.h | 45 | ||||
-rw-r--r-- | src/bicgstab.c | 410 | ||||
-rw-r--r-- | src/bicgstab.h | 60 | ||||
-rw-r--r-- | src/common.h | 29 | ||||
-rw-r--r-- | src/config.asm | 1325 | ||||
-rw-r--r-- | src/expansion.asm | 91 | ||||
-rw-r--r-- | src/gamma_freeze_template.c | 507 | ||||
-rw-r--r-- | src/make.code.defn | 7 | ||||
-rw-r--r-- | src/md.c | 573 | ||||
-rw-r--r-- | src/md.h | 19 | ||||
-rw-r--r-- | src/md_solve.c | 818 | ||||
-rw-r--r-- | src/md_solve.h | 58 | ||||
-rw-r--r-- | src/md_solve_template.c | 577 | ||||
-rw-r--r-- | src/pssolve.c | 498 | ||||
-rw-r--r-- | src/pssolve.h | 139 | ||||
-rw-r--r-- | src/register.c | 7 | ||||
-rw-r--r-- | src/threadpool.c | 174 | ||||
-rw-r--r-- | src/threadpool.h | 32 | ||||
-rw-r--r-- | src/x86inc.asm | 1544 | ||||
-rw-r--r-- | src/x86util.asm | 695 |
26 files changed, 9464 insertions, 0 deletions
diff --git a/configuration.ccl b/configuration.ccl new file mode 100644 index 0000000..3565166 --- /dev/null +++ b/configuration.ccl @@ -0,0 +1,2 @@ +# Configuration definition for thorn MinimalDistortionAxi + diff --git a/interface.ccl b/interface.ccl new file mode 100644 index 0000000..703f924 --- /dev/null +++ b/interface.ccl @@ -0,0 +1,16 @@ +# Interface definition for thorn MinimalDistortionAxi +implements: MinimalDistortionAxi + +INHERITS: ADMBase grid CoordBase MethodOfLines + +CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx) +CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx) +CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx) + +REQUIRES FUNCTION MoLRegisterConstrained +REQUIRES FUNCTION MoLRegisterSaveAndRestore +REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup + +public: +CCTK_REAL betax_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant +CCTK_REAL betaz_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant diff --git a/param.ccl b/param.ccl new file mode 100644 index 0000000..3c3d285 --- /dev/null +++ b/param.ccl @@ -0,0 +1,37 @@ +# Parameter definitions for thorn MinimalDistortionAxi +# +SHARES: ADMBase +EXTENDS KEYWORD shift_evolution_method +{ + "minimal_distortion_axi" :: "minimal distortion axi" +} + +RESTRICTED: +CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover +{ + 1: :: "" +} 40 + +CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover +{ + 1: :: "" +} 40 + +CCTK_REAL filter_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +CCTK_REAL scale_factor "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +CCTK_REAL scale_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in beta*_coeffs" STEERABLE=recover +{ +} "no" diff --git a/schedule.ccl b/schedule.ccl new file mode 100644 index 0000000..5976500 --- /dev/null +++ 
b/schedule.ccl @@ -0,0 +1,30 @@ +# Schedule definitions for thorn MinimalDistortionAxi +# +if (CCTK_Equals(shift_evolution_method, "minimal_distortion_axi")) { + SCHEDULE minimal_distortion_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_RHS { + LANG: C + } "Minimal distortion shift eval" + + SCHEDULE minimal_distortion_solve IN ML_BSSN_evolCalcGroup BEFORE minimal_distortion_eval { + #SCHEDULE minimal_distortion_solve IN MoL_PreStep { + LANG: C + } "Minimal distortion solve W" + + #SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution { + # LANG: C + #} "Quasimaximal slicing" + + SCHEDULE minimal_distortion_init IN ADMBase_InitialData { + LANG: C + } "" + + SCHEDULE minimal_distortion_axi_register_mol IN MoL_Register { + LANG: C + } "" + + + if (export_coeffs) { + STORAGE: betax_coeffs + STORAGE: betaz_coeffs + } +} diff --git a/src/MinimalDistortion.m b/src/MinimalDistortion.m new file mode 100644 index 0000000..c0d34a6 --- /dev/null +++ b/src/MinimalDistortion.m @@ -0,0 +1,1490 @@ + +SetEnhancedTimes[False]; +SetSourceLanguage["C"]; + +(******************************************************************************) +(* Options *) +(******************************************************************************) + +createCode[derivOrder_, useJacobian_, splitUpwindDerivs_, evolutionTimelevels_, addMatter_, formulation_] := +Module[{prefix, suffix, thorn}, + +prefix = "ML_"; +suffix = + "" + <> If [useJacobian, "_MP", ""] + <> If [derivOrder!=4, "_O" <> ToString[derivOrder], ""] + <> If [splitUpwindDerivs, "", "_UPW"] + (* <> If [evolutionTimelevels!=3, "_TL" <> ToString[evolutionTimelevels], ""] *) + (* <> If [addMatter==1, "_M", ""] *) + ; + +thorn = prefix <> formulation <> suffix; + +SetAttributes[IfCCZ4, HoldAll]; +IfCCZ4[expr_, else_:Sequence[]] := If[formulation === "CCZ4", expr, Unevaluated[else]]; + +(******************************************************************************) +(* Derivatives *) 
+(******************************************************************************) + +KD = KroneckerDelta; + +derivatives = +{ + PDstandardNth[i_] -> StandardCenteredDifferenceOperator[1,fdOrder/2,i], + PDstandardNth[i_,i_] -> StandardCenteredDifferenceOperator[2,fdOrder/2,i], + PDstandardNth[i_,j_] -> StandardCenteredDifferenceOperator[1,fdOrder/2,i] * + StandardCenteredDifferenceOperator[1,fdOrder/2,j], + PDdissipationNth[i_] -> + (-1)^(fdOrder/2) * + spacing[i]^(fdOrder+1) / 2^(fdOrder+2) * + StandardCenteredDifferenceOperator[fdOrder+2,fdOrder/2+1,i], + +(* PD: These come from my mathematica notebook + "Upwind-Kranc-Convert.nb" that converts upwinding finite + differencing operators generated by + StandardUpwindDifferenceOperator into this form *) + + Sequence@@Flatten[Table[ + {PDupwindNth[i] -> Switch[fdOrder, + 2, (dir[i]*(-3 + 4*shift[i]^dir[i] - shift[i]^(2*dir[i])))/(2*spacing[i]), + 4, (dir[i]*(-10 - 3/shift[i]^dir[i] + 18*shift[i]^dir[i] - + 6*shift[i]^(2*dir[i]) + shift[i]^(3*dir[i])))/(12*spacing[i]), + 6, (dir[i]*(-35 + 2/shift[i]^(2*dir[i]) - 24/shift[i]^dir[i] + 80*shift[i]^dir[i] - + 30*shift[i]^(2*dir[i]) + 8*shift[i]^(3*dir[i]) - shift[i]^(4*dir[i])))/(60*spacing[i]), + 8, (dir[i]*(-378 - 5/shift[i]^(3*dir[i]) + 60/shift[i]^(2*dir[i]) - 420/shift[i]^dir[i] + + 1050*shift[i]^dir[i] - 420*shift[i]^(2*dir[i]) + 140*shift[i]^(3*dir[i]) - 30*shift[i]^(4*dir[i]) + + 3*shift[i]^(5*dir[i])))/(840*spacing[i])], + + PDupwindNthAnti[i] -> Switch[fdOrder, + 2, (+1 shift[i]^(-2) -4 shift[i]^(-1) +0 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]), + 4, (-1 shift[i]^(-3) +6 shift[i]^(-2) -21 shift[i]^(-1 )+0 shift[i]^( 0) +21 shift[i]^(+1) + -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]), + 6, (+1 shift[i]^(-4) -8 shift[i]^(-3) +32 shift[i]^(-2) -104 shift[i]^(-1) +0 shift[i]^( 0) + +104 shift[i]^(+1) -32 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]), + 8, (-3 shift[i]^(-5) +30 shift[i]^(-4) -145 shift[i]^(-3) +480 
shift[i]^(-2) -1470 shift[i]^(-1) + +0 shift[i]^( 0) +1470 shift[i]^(+1) -480 shift[i]^(+2) +145 shift[i]^(+3) -30 shift[i]^(+4) + +3 shift[i]^(+5)) / (1680 spacing[i])], + + PDupwindNthSymm[i] -> Switch[fdOrder, + 2, (-1 shift[i]^(-2) +4 shift[i]^(-1) -6 shift[i]^( 0) +4 shift[i]^(+1) -1 shift[i]^(+2)) / (4 spacing[i]), + 4, (+1 shift[i]^(-3) -6 shift[i]^(-2) +15 shift[i]^(-1) -20 shift[i]^( 0) +15 shift[i]^(+1) + -6 shift[i]^(+2) +1 shift[i]^(+3)) / (24 spacing[i]), + 6, (-1 shift[i]^(-4) +8 shift[i]^(-3) - 28 shift[i]^(-2)+56 shift[i]^(-1)-70 shift[i]^( 0) + +56 shift[i]^(+1) -28 shift[i]^(+2) +8 shift[i]^(+3) -1 shift[i]^(+4)) / (120 spacing[i]), + 8, (+3 shift[i]^(-5) -30 shift[i]^(-4) +135 shift[i]^(-3) -360 shift[i]^(-2) +630 shift[i]^(-1) + -756 shift[i]^( 0) +630 shift[i]^(+1) -360 shift[i]^(+2) +135 shift[i]^(+3) -30 shift[i]^(+4) + +3 shift[i]^(+5)) / (1680 spacing[i])], + + (* TODO: make these higher order stencils *) + PDonesided[i] -> dir[i] (-1 + shift[i]^dir[i]) / spacing[i]} /. 
i->j, {j,1,3}],1] +}; + +PD = PDstandardNth; +PDu = PDupwindNth; +PDua = PDupwindNthAnti; +PDus = PDupwindNthSymm; +(* PDo = PDonesided; *) +PDdiss = PDdissipationNth; + +If [splitUpwindDerivs, + Upwind[dir_, var_, idx_] := dir PDua[var,idx] + Abs[dir] PDus[var,idx], + Upwind[dir_, var_, idx_] := dir PDu[var,idx]]; + + + +(******************************************************************************) +(* Tensors *) +(******************************************************************************) + +(* Register the tensor quantities with the TensorTools package *) +Map [DefineTensor, + {normal, tangentA, tangentB, dir, + nn, nu, nlen, nlen2, su, vg, + xx, rr, th, ph, + admg, admK, admalpha, admdtalpha, qmsw, admbeta, admdtbeta, H, M, term1, term2, term3, + g, detg, gu, G, R, trR, Km, trK, cdphi, cdphi2, + phi, gt, At, Xt, Xtn, Theta, Z, + (* + alpha, A, + *) + alpha, + beta, B, Atm, Atu, trA, Ats, trAts, + Kdot, Xtdot, phidot, K, Km, + dottrK, dotXt, + cXt, cS, cA, + e4phi, em4phi, ddetg, detgt, gtu, ddetgt, dgtu, ddgtu, Gtl, Gtlu, Gt, Ddetgt, + Rt, Rphi, gK, + T00, T0, T, rho, S, + x, y, z, r, + epsdiss}]; + +(* NOTE: It seems as if Lie[.,.] did not take these tensor weights + into account. Presumably, CD[.,.] and CDt[.,.] don't do this either. 
*) +SetTensorAttribute[phi, TensorWeight, +1/6]; +SetTensorAttribute[gt, TensorWeight, -2/3]; +SetTensorAttribute[Xt, TensorWeight, +2/3]; +SetTensorAttribute[At, TensorWeight, -2/3]; +SetTensorAttribute[cXt, TensorWeight, +2/3]; +SetTensorAttribute[cS, TensorWeight, +2 ]; + +Map [AssertSymmetricIncreasing, + {admg[la,lb], admK[la,lb], g[la,lb], K[la,lb], R[la,lb], cdphi2[la,lb], + gt[la,lb], At[la,lb], Ats[la,lb], Rt[la,lb], Rphi[la,lb], T[la,lb], Kdot[la, lb]}]; +AssertSymmetricIncreasing [G[ua,lb,lc], lb, lc]; +AssertSymmetricIncreasing [Gtl[la,lb,lc], lb, lc]; +AssertSymmetricIncreasing [Gt[ua,lb,lc], lb, lc]; +AssertSymmetricIncreasing [gK[la,lb,lc], la, lb]; +Map [AssertSymmetricIncreasing, + {gu[ua,ub], gtu[ua,ub], Atu[ua,ub]}]; +AssertSymmetricIncreasing [dgtu[ua,ub,lc], ua, ub]; +AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], ua, ub]; +AssertSymmetricIncreasing [ddgtu[ua,ub,lc,ld], lc, ld]; + +DefineConnection [CD, PD, G]; +DefineConnection [CDt, PD, Gt]; + +(* Use the CartGrid3D variable names *) +x1=x; x2=y; x3=z; + +(* Use the ADMBase variable names *) +admg11=gxx; admg12=gxy; admg22=gyy; admg13=gxz; admg23=gyz; admg33=gzz; +admK11=kxx; admK12=kxy; admK22=kyy; admK13=kxz; admK23=kyz; admK33=kzz; +admalpha=alp; +admdtalpha=dtalp; +admbeta1=betax; admbeta2=betay; admbeta3=betaz; +admdtbeta1=dtbetax; admdtbeta2=dtbetay; admdtbeta3=dtbetaz; +qmsw=W; +(*alpha=admalpha;*) + +(* Use the TmunuBase variable names *) +T00=eTtt; +T01=eTtx; T02=eTty; T03=eTtz; +T11=eTxx; T12=eTxy; T22=eTyy; T13=eTxz; T23=eTyz; T33=eTzz; + + + +(******************************************************************************) +(* Expressions *) +(******************************************************************************) + +(* enum constants for conformalMethod; these must be consistent + with the definition of the Cactus parameter conformalMethod *) +CMphi = 0; +CMW = 1; + +detgExpr = Det [MatrixOfComponents [g [la,lb]]]; +ddetgExpr[la_] = + Sum [D[Det[MatrixOfComponents[g[la, 
lb]]], X] PD[X, la], + {X, Union[Flatten[MatrixOfComponents[g[la, lb]]]]}]; + +detgtExpr = Det [MatrixOfComponents [gt[la,lb]]]; +ddetgtExpr[la_] = + Sum [D[Det[MatrixOfComponents[gt[la, lb]]], X] PD[X, la], + {X, Union[Flatten[MatrixOfComponents[gt[la, lb]]]]}]; + +etaExpr = SpatialBetaDriverRadius / Max [r, SpatialBetaDriverRadius]; +thetaExpr = Min [Exp [1 - r / SpatialShiftGammaCoeffRadius], 1]; + + + +(******************************************************************************) +(* Groups *) +(******************************************************************************) + +evolvedGroups = + {SetGroupName [CreateGroupFromTensor [phi ], prefix <> "log_confac"], + SetGroupName [CreateGroupFromTensor [gt[la,lb]], prefix <> "metric" ], + SetGroupName [CreateGroupFromTensor [Xt[ua] ], prefix <> "Gamma" ], + SetGroupName [CreateGroupFromTensor [trK ], prefix <> "trace_curv"], + SetGroupName [CreateGroupFromTensor [At[la,lb]], prefix <> "curv" ], + SetGroupName [CreateGroupFromTensor [alpha ], prefix <> "lapse" ], +(*SetGroupName [CreateGroupFromTensor [A ], prefix <> "dtlapse" ],*) + SetGroupName [CreateGroupFromTensor [Kdot[la, lb]], prefix <> "Kdot" ], + SetGroupName [CreateGroupFromTensor [Xtdot[ua]], prefix <> "Xtdot" ], + SetGroupName [CreateGroupFromTensor [phidot], prefix <> "phidot" ], + SetGroupName [CreateGroupFromTensor [beta[ua] ], prefix <> "shift" ], + SetGroupName [CreateGroupFromTensor [B[ua] ], prefix <> "dtshift" ], + IfCCZ4[SetGroupName[CreateGroupFromTensor[Theta], prefix <> "Theta"]]}; +evaluatedGroups = + {SetGroupName [CreateGroupFromTensor [H ], prefix <> "Ham"], + SetGroupName [CreateGroupFromTensor [M[la] ], prefix <> "mom"], + SetGroupName [CreateGroupFromTensor [term1 ], prefix <> "term1"], + SetGroupName [CreateGroupFromTensor [term2 ], prefix <> "term2"], + SetGroupName [CreateGroupFromTensor [term3 ], prefix <> "term3"], + SetGroupName [CreateGroupFromTensor [cS ], prefix <> "cons_detg"], + SetGroupName [CreateGroupFromTensor 
[cXt[ua]], prefix <> "cons_Gamma"], + SetGroupName [CreateGroupFromTensor [cA ], prefix <> "cons_traceA"]}; + +declaredGroups = Join [evolvedGroups, evaluatedGroups]; +declaredGroupNames = Map [First, declaredGroups]; + + + +extraGroups = + {{"Grid::coordinates", {x, y, z, r}}, + {"ADMBase::metric", {gxx, gxy, gxz, gyy, gyz, gzz}}, + {"ADMBase::curv", {kxx, kxy, kxz, kyy, kyz, kzz}}, + {"ADMBase::lapse", {alp}}, + {"ADMBase::dtlapse", {dtalp}}, + {"ADMBase::shift", {betax, betay, betaz}}, + {"ADMBase::dtshift", {dtbetax, dtbetay, dtbetaz}}, + {"QuasiMaximalSlicing::W", { W }}, + {"TmunuBase::stress_energy_scalar", {eTtt}}, + {"TmunuBase::stress_energy_vector", {eTtx, eTty, eTtz}}, + {"TmunuBase::stress_energy_tensor", {eTxx, eTxy, eTxz, eTyy, eTyz, eTzz}} +}; + +groups = Join [declaredGroups, extraGroups]; + + + +(******************************************************************************) +(* Initial data *) +(******************************************************************************) + +initialCalc = +{ + Name -> thorn <> "_Minkowski", + Schedule -> {"IN ADMBase_InitialData"}, + ConditionalOnKeyword -> {"my_initial_data", "Minkowski"}, + Equations -> + { + phi -> IfThen[conformalMethod==CMW, 1, 0], + gt[la,lb] -> KD[la,lb], + trK -> 0, + At[la,lb] -> 0, + Xt[ua] -> 0, + (*alpha -> 1, + A -> 0,*) + beta[ua] -> 0, + B[ua] -> 0, + IfCCZ4[Theta -> 0] + } +}; + + + +(******************************************************************************) +(* Split a calculation *) +(******************************************************************************) + +PartialCalculation[calc_, suffix_, updates_, evolVars_] := +Module[ + {name, calc1, replaces, calc2, vars, patterns, eqs, calc3}, + (* Add suffix to name *) + name = lookup[calc, Name] <> suffix; + calc1 = mapReplace[calc, Name, name]; + (* Replace some entries in the calculation *) + (* replaces = Map[Function[rule, mapReplace[#, rule[[1]], rule[[2]]]&], updates]; *) + replaces = updates //. 
(lhs_ -> rhs_) -> (mapReplace[#, lhs, rhs]&); + calc2 = Apply[Composition, replaces][calc1]; + (* Remove unnecessary equations *) + vars = Join[evolVars, lookup[calc2, Shorthands]]; + patterns = Replace[vars, { Tensor[n_,__] -> Tensor[n,__] , + dot[Tensor[n_,__]] -> dot[Tensor[n,__]]}, 1]; + eqs = FilterRules[lookup[calc, Equations], patterns]; + calc3 = mapReplace[calc2, Equations, eqs]; + calc3 +]; + + + +(******************************************************************************) +(* Convert from ADMBase *) +(******************************************************************************) + +convertFromADMBaseCalc = +{ + Name -> thorn <> "_convertFromADMBase", + Schedule -> {"AT initial AFTER ADMBase_PostInitial"}, + ConditionalOnKeyword -> {"my_initial_data", "ADMBase"}, + Shorthands -> {g[la,lb], detg, gu[ua,ub], em4phi}, + Equations -> + { + g[la,lb] -> admg[la,lb], + detg -> detgExpr, + gu[ua,ub] -> 1/detg detgExpr MatrixInverse [g[ua,ub]], + + phi -> IfThen[conformalMethod==CMW, detg^(-1/6), Log[detg]/12], + em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]], + gt[la,lb] -> em4phi g[la,lb], + + trK -> gu[ua,ub] admK[la,lb], + At[la,lb] -> em4phi (admK[la,lb] - (1/3) g[la,lb] trK), + + alpha -> admalpha, + + beta[ua] -> admbeta[ua], + + IfCCZ4[Theta -> 0] + } +}; + +convertFromADMBaseGammaCalc = +{ + Name -> thorn <> "_convertFromADMBaseGamma", + Schedule -> {"AT initial AFTER " <> thorn <> "_convertFromADMBase"}, + ConditionalOnKeyword -> {"my_initial_data", "ADMBase"}, + (* + Where -> InteriorNoSync, + *) + (* Do not synchronise right after this routine; instead, synchronise + after extrapolating *) + Where -> Interior, + (* Synchronise after this routine, so that the refinement boundaries + are set correctly before extrapolating. (We will need to + synchronise again after extrapolating because extrapolation does + not fill ghost zones, but this is irrelevant here.) 
*) + Shorthands -> {dir[ua], + detgt, gtu[ua,ub], Gt[ua,lb,lc], theta}, + Equations -> + { + dir[ua] -> Sign[beta[ua]], + + detgt -> 1 (* detgtExpr *), + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + Gt[ua,lb,lc] -> 1/2 gtu[ua,ud] + (PD[gt[lb,ld],lc] + PD[gt[lc,ld],lb] - PD[gt[lb,lc],ld]), + Xt[ua] -> gtu[ub,uc] Gt[ua,lb,lc], + +(* + A -> - admdtalpha / (harmonicF alpha^harmonicN) (LapseAdvectionCoeff - 1), +*) + (* If LapseACoeff=0, then A is not evolved, in the sense that it + does not influence the time evolution of other variables. *) + (*A -> IfThen [LapseACoeff != 0, + 1 / (- harmonicF alpha^harmonicN) + (+ admdtalpha + - LapseAdvectionCoeff Upwind[beta[ua], alpha, la]), + 0],*) + + theta -> thetaExpr, + + (* If ShiftBCoeff=0 or theta ShiftGammaCoeff=0, then B^i is not + evolved, in the sense that it does not influence the time + evolution of other variables. *) + B[ua] -> IfThen [ShiftGammaCoeff ShiftBCoeff != 0, + 1 / (theta ShiftGammaCoeff) + (+ admdtbeta[ua] + - ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb]), + 0] + } +}; + +(* Initialise the Gamma variables to 0. This is necessary with + multipatch because convertFromADMBaseGamma does not perform the + conversion in the boundary points, and the order in which symmetry + (interpatch) and outer boundary conditions is applied means that + points which are both interpatch and symmetry points are never + initialised. 
*) +initGammaCalc = +{ + Name -> thorn <> "_InitGamma", + Schedule -> {"AT initial BEFORE " <> thorn <> "_convertFromADMBaseGamma"}, + ConditionalOnKeyword -> {"my_initial_data", "ADMBase"}, + Where -> Everywhere, + Equations -> + { + Xt[ua] -> 0, + (*A -> 0,*) + B[ua] -> 0 + } +}; + + + +(******************************************************************************) +(* Convert to ADMBase *) +(******************************************************************************) + +convertToADMBaseCalc = +{ + Name -> thorn <> "_convertToADMBase", + Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"}, + Where -> Everywhere, + Shorthands -> {e4phi}, + Equations -> + { + e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]], + admg[la,lb] -> e4phi gt[la,lb], + admK[la,lb] -> e4phi At[la,lb] + (1/3) admg[la,lb] trK, + admalpha -> alpha, + admbeta[ua] -> beta[ua] + } +}; + +convertToADMBaseDtLapseShiftCalc = +{ + Name -> thorn <> "_convertToADMBaseDtLapseShift", + Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"}, + ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"}, + Where -> Interior, + Shorthands -> {dir[ua], detgt, gtu[ua,ub], eta, theta, em4phi, Ddetgt[la]}, + Equations -> + { + dir[ua] -> Sign[beta[ua]], + + detgt -> 1 (* detgtExpr *), + (* This leads to simpler code... *) + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]], + + eta -> etaExpr, + theta -> thetaExpr, + + (* Ddetgt should be zero analytically, but we're not assuming it here. 
Change commenting to assume it.*) + Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la], + (*Ddetgt[la] -> 0,*) + + (* see RHS *) +(* + admdtalpha -> - harmonicF alpha^harmonicN + ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK) + + LapseAdvectionCoeff beta[ua] PDu[alpha,la], + admdtalpha -> - harmonicF alpha^harmonicN + (+ LapseACoeff A + + ((1 - LapseACoeff) + (trK - IfCCZ4[2 Theta, 0]))) + + LapseAdvectionCoeff Upwind[beta[ua], alpha, la], +*) + admdtbeta[ua] -> IfThen[harmonicShift, + - 1/2 gtu[ua,uj] em4phi alpha + (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj] + + 2 PD[alpha,lj] + + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])), + (* else *) + + theta ShiftGammaCoeff + (+ ShiftBCoeff B[ua] + + (1 - ShiftBCoeff) + (Xt[ua] - eta BetaDriver beta[ua]))] + + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb] + } +}; + +convertToADMBaseDtLapseShiftBoundaryCalc = +{ + Name -> thorn <> "_convertToADMBaseDtLapseShiftBoundary", + Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"}, + ConditionalOnKeyword -> {"dt_lapse_shift_method", "correct"}, + Where -> BoundaryWithGhosts, + Shorthands -> {detgt, gtu[ua,ub], eta, theta}, + Equations -> + { + detgt -> 1 (* detgtExpr *), + (* This leads to simpler code... 
*) + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + + eta -> etaExpr, + theta -> thetaExpr, + + (* see RHS, but omit derivatives near the boundary *) +(* + admdtalpha -> - harmonicF alpha^harmonicN + ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK), + admdtalpha -> - harmonicF alpha^harmonicN + (+ LapseACoeff A + + ((1 - LapseACoeff) + (trK - IfCCZ4[2 Theta, 0]))), +*) + admdtbeta[ua] -> IfThen[harmonicShift, + 0, + (* else *) + + theta ShiftGammaCoeff + (+ ShiftBCoeff B[ua] + + (1 - ShiftBCoeff) + (Xt[ua] - eta BetaDriver beta[ua]))] + } +}; + +convertToADMBaseFakeDtLapseShiftCalc = +{ + Name -> thorn <> "_convertToADMBaseFakeDtLapseShift", + Schedule -> {"IN " <> thorn <> "_convertToADMBaseGroup"}, + ConditionalOnKeyword -> {"dt_lapse_shift_method", "noLapseShiftAdvection"}, + Where -> Everywhere, + Shorthands -> {detgt, gtu[ua,ub], eta, theta}, + Equations -> + { + detgt -> 1 (* detgtExpr *), + (* This leads to simpler code... *) + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + + eta -> etaExpr, + theta -> thetaExpr, + + (* see RHS, but omit derivatives everywhere (which is wrong, but + faster, since it does not require synchronisation or boundary + conditions) *) +(* + admdtalpha -> - harmonicF alpha^harmonicN + ((1 - LapseAdvectionCoeff) A + LapseAdvectionCoeff trK), + admdtalpha -> - harmonicF alpha^harmonicN + (+ LapseACoeff A + + ((1 - LapseACoeff) + (trK - IfCCZ4[2 Theta, 0]))), +*) + admdtbeta[ua] -> IfThen[harmonicShift, + 0, + (* else *) + + theta ShiftGammaCoeff + (+ ShiftBCoeff B[ua] + + (1 - ShiftBCoeff) + (Xt[ua] - eta BetaDriver beta[ua]))] + } +}; + +(******************************************************************************) +(* Evolution equations *) +(******************************************************************************) + +evolCalc = +{ + Name -> thorn <> "_RHS", + Schedule -> {"IN " <> thorn <> "_evolCalcGroup"}, + (* + Where -> Interior, + *) + (* Synchronise the RHS grid functions after this 
routine, so that + the refinement boundaries are set correctly before applying the + radiative boundary conditions. *) + Where -> InteriorNoSync, + Shorthands -> {dir[ua], + detgt, gtu[ua,ub], + Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], G[ua, lb, lc], ddetg[la], Xtn[ua], + Rt[la,lb], Rphi[la,lb], R[la,lb], + Atm[ua,lb], Atu[ua,ub], + e4phi, em4phi, cdphi[la], cdphi2[la,lb], g[la,lb], detg, + gu[ua,ub], Ats[la,lb], trAts, eta, theta, + K[la, lb], Km[la, ub], + rho, S[la], trS, fac1, fac2, dottrK, dotXt[ua], + epsdiss[ua], IfCCZ4[Z[ua]], IfCCZ4[dotTheta], Ddetgt[la]}, + Equations -> + { + dir[ua] -> Sign[beta[ua]], + + detgt -> 1 (* detgtExpr *), + + (* This leads to simpler code... *) + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + Gtl[la,lb,lc] -> 1/2 + (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]), + Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld], + Gt[ua,lb,lc] -> gtu[ua,ud] Gtl[ld,lb,lc], + + (* The conformal connection functions calculated from the conformal metric, + used instead of Xt where no derivatives of Xt are taken *) + Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk], + + e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]], + em4phi -> 1 / e4phi, + g[la,lb] -> e4phi gt[la,lb], + detg -> detgExpr, + gu[ua,ub] -> em4phi gtu[ua,ub], + ddetg[la] -> 4 detgt e4phi PD[phi,la], + G[ua,lb,lc] -> Gt[ua,lb,lc] + + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb] + - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]), + K[la, lb] -> e4phi At[la, lb] + (1/3) g[la, lb] trK, + Km[la, ub] -> gu[ub, uc] K[la, lc], + + (* The Z quantities *) + (* gr-qc:1106.2254 (2011), eqn. (23) *) + IfCCZ4[ + Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc]) + ], + + (* PRD 62, 044034 (2000), eqn. 
(18) *) + (* Adding Z term by changing Xtn to Xt *) + Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm] + + (1/2) gt[lk,li] PD[Xt[uk],lj] + + (1/2) gt[lk,lj] PD[Xt[uk],li] + + (1/2) Xtn[uk] Gtl[li,lj,lk] + + (1/2) Xtn[uk] Gtl[lj,li,lk] + + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul] + + Gt[uk,lj,ll] Gtlu[li,lk,ul] + + Gt[uk,li,ll] Gtlu[lk,lj,ul]), + + fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1], + cdphi[la] -> fac1 CDt[phi,la], + fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0], + cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb], + + (* PRD 62, 044034 (2000), eqn. (15) *) + Rphi[li,lj] -> - 2 cdphi2[lj,li] + - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln] + + 4 cdphi[li] cdphi[lj] + - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll], + + Atm[ua,lb] -> gtu[ua,uc] At[lc,lb], + Atu[ua,ub] -> Atm[ua,lc] gtu[ub,uc], + + R[la,lb] -> Rt[la,lb] + Rphi[la,lb], + IfCCZ4[ + R[la,lb] -> R[la,lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb] + + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc]) + + e4phi Z[uc] PD[gt[la,lb],lc] + ], + + (* Matter terms *) + + (* rho = n^a n^b T_ab *) + rho -> addMatter + (1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj])), + + (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *) + S[li] -> addMatter (-1/alpha (T0[li] - beta[uj] T[li,lj])), + + (* trS = gamma^ij T_ij *) + trS -> addMatter (em4phi gtu[ui,uj] T[li,lj]), + + (* RHS terms *) + + (* PRD 62, 044034 (2000), eqn. (10) *) + (* PRD 67 084023 (2003), eqn. (16) and (23) *) + dot[phi] -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6] + (alpha trK - PD[beta[ua],la]), + phidot -> IfThen[conformalMethod==CMW, 1/3 phi, -1/6] + (alpha trK - PD[beta[ua],la]), + + (* PRD 62, 044034 (2000), eqn. (9) *) + (* gr-qc:1106.2254 (2011), eqn. 
(14) *) + (* removing trA from Aij ensures that detg = 1 *) + dot[gt[la,lb]] -> - 2 alpha (At[la,lb] - IfCCZ4[(1/3) At[lc,ld] gtu[uc,ud] gt[la,lb], 0]) + + gt[la,lc] PD[beta[uc],lb] + gt[lb,lc] PD[beta[uc],la] + - (2/3) gt[la,lb] PD[beta[uc],lc], + (* PRD 62, 044034 (2000), eqn. (20) *) + (* PRD 67 084023 (2003), eqn (26) *) + (* gr-qc:1106.2254 (2011), eqn. (19) *) + (* Adding Z terms by changing Xtn to Xt, + also adding extra Z and Theta terms *) + dotXt[ui] -> - 2 Atu[ui,uj] PD[alpha,lj] + + 2 alpha (+ Gt[ui,lj,lk] Atu[uk,uj] + - (2/3) gtu[ui,uj] PD[trK,lj] + + 6 Atu[ui,uj] cdphi[lj]) + + gtu[uj,ul] PD[beta[ui],lj,ll] + + (1/3) gtu[ui,uj] PD[beta[ul],lj,ll] + - Xtn[uj] PD[beta[ui],lj] + + (2/3) Xtn[ui] PD[beta[uj],lj] + + IfCCZ4[ + + GammaShift 2 e4phi (- Z[uj] PD[beta[ui],lj] + + (2/3) Z[ui] PD[beta[uj],lj]) + - (4/3) alpha e4phi Z[ui] trK + + 2 gtu[ui,uj] (+ alpha PD[Theta,lj] + - Theta PD[alpha,lj]) + - 2 alpha e4phi dampk1 Z[ui], + 0] + (* Equation (4.28) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *) + + addMatter (- 16 Pi alpha gtu[ui,uj] S[lj]), + dot[Xt[ui]] -> dotXt[ui], + Xtdot[ui] -> dotXt[ui], + + (* gr-qc:1106.2254 (2011), eqn. (18) *) + IfCCZ4[ + dotTheta -> + - PD[alpha,la] Z[ua] - dampk1 (2 + dampk2) alpha Theta + + (1/2) alpha (gu[ua,ub] R[la,lb] - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - 2 trK Theta) + + addMatter (- 8 Pi alpha rho) + ], + + IfCCZ4[ + dot[Theta] -> dotTheta + ], + + (* PRD 62, 044034 (2000), eqn. (11) *) + (* gr-qc:1106.2254 (2011), eqn. 
(17) *) + (* Adding the RHS of Theta to K, because K_Z4 = K_BSSN + 2 Theta *) + (* Also adding the Z term, as it has to cancel with the one in Theta *) + (*dottrK -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb] + + 2 cdphi[la] PD[alpha,lb] ) + - Xtn[ua] PD[alpha,la] ) + + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2) + + IfCCZ4[ + + 2 dotTheta + 2 PD[alpha,la] Z[ua] + + dampk1 (1 - dampk2) alpha Theta, + 0]*) + term1 -> - em4phi ( gtu[ua,ub] ( PD[alpha,la,lb] + + 2 cdphi[la] PD[alpha,lb] ) + - Xtn[ua] PD[alpha,la] ), + term2 -> + alpha (Atm[ua,lb] Atm[ub,la] + (1/3) trK^2), + term3 -> IfCCZ4[+ 2 dotTheta + 2 PD[alpha,la] Z[ua] + + dampk1 (1 - dampk2) alpha Theta, 0], + dottrK -> term1 + term2 + term3 + (* Equation (4.21) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *) + + addMatter (4 Pi alpha (rho + trS)), + dot[trK] -> KEvolFactor dottrK, + + (* PRD 62, 044034 (2000), eqn. (12) *) + (* TODO: Should we use the Hamiltonian constraint to make Rij tracefree? *) + (* gr-qc:1106.2254 (2011), eqn. (15) *) + (* Adding Z terms in the Ricci and Theta terms *) + Ats[la,lb] -> - CDt[alpha,la,lb] + + + 2 (PD[alpha,la] cdphi[lb] + PD[alpha,lb] cdphi[la] ) + + alpha R[la,lb], + trAts -> gu[ua,ub] Ats[la,lb], + dot[At[la,lb]] -> + em4phi (+ Ats[la,lb] - (1/3) g[la,lb] trAts ) + + alpha (+ ((trK - IfCCZ4[2 Theta, 0]) + At[la,lb]) + - 2 At[la,lc] Atm[uc,lb]) + + At[la,lc] PD[beta[uc],lb] + At[lb,lc] PD[beta[uc],la] + - (2/3) At[la,lb] PD[beta[uc],lc] + (* Equation (4.23) in Baumgarte & Shapiro (Phys. Rept. 376 (2003) 41-131) *) + + addMatter (- em4phi alpha 8 Pi + (T[la,lb] - (1/3) g[la,lb] trS)), + Kdot[la, lb] -> -CD[alpha, la, lb] + alpha (R[la, lb] + trK K[la, lb] - 2 K[la, lc] Km[lb, uc]), + + + eta -> etaExpr, + theta -> thetaExpr, + + (* Ddetgt should be zero analytically, but we're not assuming it here. 
Change commenting to assume it.*) + Ddetgt[la] -> gtu[uk,ul] PD[gt[lk,ll],la], + (*Ddetgt[la] -> 0,*) + + (* dot[beta[ua]] -> eta Xt[ua], *) + (* dot[beta[ua]] -> ShiftGammaCoeff alpha^ShiftAlphaPower B[ua], *) + dot[beta[ua]] -> IfThen[harmonicShift, + - 1/2 gtu[ua,uj] em4phi alpha + (- 2 alpha IfThen[conformalMethod==CMW,1/phi,-2] PD[phi,lj] + + 2 PD[alpha,lj] + + alpha (Ddetgt[lj] - 2 gtu[uk,ul] PD[gt[lj,lk],ll])), + (* else *) + + theta ShiftGammaCoeff + (+ ShiftBCoeff B[ua] + + (1 - ShiftBCoeff) + (Xt[ua] - eta BetaDriver beta[ua]))], + + dot[B[ua]] -> + ShiftBCoeff (dotXt[ua] - eta BetaDriver B[ua]) + (* Note that this dotXt[ua] is not yet \partial_t \tilde \Gamma^i, because the + advection term has not yet been added. It is actually + \partial_t \tilde \Gamma^i - \beta^j \partial_j \tilde \Gamma^i *) + } +}; + +lapseEvolCalc = { + Name -> thorn <> "_lapse_evol", + Schedule -> {"IN " <> thorn <> "_evolCalcGroup"}, + (* + Where -> Interior, + *) + (* Synchronise the RHS grid functions after this routine, so that + the refinement boundaries are set correctly before applying the + radiative boundary conditions. *) + Where -> InteriorNoSync, + Shorthands -> {}, + Equations -> + { + dot[alpha] -> - harmonicF alpha^harmonicN (+ trK - IfCCZ4[2 Theta, 0] + AlphaDriver (alpha - 1)) + WFactor qmsw + } +}; + +advectCalc = +{ + Name -> thorn <> "_Advect", + Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <> + "AFTER (" <> thorn <> "_RHS " <> thorn <> "_lapse_evol " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"}, + (* + Where -> Interior, + *) + (* Synchronise the RHS grid functions after this routine, so that + the refinement boundaries are set correctly before applying the + radiative boundary conditions. 
*) + Where -> InteriorNoSync, + Shorthands -> {dir[ua]}, + Equations -> + { + dir[ua] -> Sign[beta[ua]], + + dot[phi] -> dot[phi] + Upwind[beta[ua], phi, la], + + dot[gt[la,lb]] -> dot[gt[la,lb]] + Upwind[beta[uc], gt[la,lb], lc], + + dot[Xt[ui]] -> dot[Xt[ui]] + Upwind[beta[uj], Xt[ui], lj], + + IfCCZ4[ + dot[Theta] -> dot[Theta] + Upwind[beta[ua], Theta, la] + ], + + dot[trK] -> dot[trK] + Upwind[beta[ua], trK, la], + + dot[At[la,lb]] -> dot[At[la,lb]] + Upwind[beta[uc], At[la,lb], lc], + + (* + dot[alpha] -> dot[alpha] + + LapseAdvectionCoeff Upwind[beta[ua], alpha, la], + + dot[A] -> dot[A] + + LapseACoeff ( + + LapseAdvectionCoeff Upwind[beta[ua], A, la] + + (1 - LapseAdvectionCoeff) Upwind[beta[ua], trK, la]), + *) + + dot[beta[ua]] -> dot[beta[ua]] + + ShiftAdvectionCoeff Upwind[beta[ub], beta[ua], lb], + + dot[B[ua]] -> dot[B[ua]] + + ShiftBCoeff ( + + ShiftAdvectionCoeff Upwind[beta[ub], B[ua], lb] + + ((1 - ShiftAdvectionCoeff) + Upwind[beta[ub], Xt[ua], lb])) + (* Note that the advection term \beta^j \partial_j \tilde \Gamma^i is not + subtracted here when ShiftAdvectionCoefficient == 1 because it was + implicitly subtracted before (see comment in previous calculation of + dot[B[ua]]. 
*) + } +}; + +evolCalc1 = PartialCalculation[evolCalc, "1", + { + ConditionalOnKeyword -> {"RHS_calculation", "split"} + }, + { + dot[phi], + dot[gt[la,lb]], + dot[Xt[ui]], + term1, term2, term3, + dot[trK], + dot[beta[ua]], + dot[B[ua]], + IfCCZ4[dot[Theta]] + }]; + +evolCalc2 = PartialCalculation[evolCalc, "2", + { + ConditionalOnKeyword -> {"RHS_calculation", "split"} + }, + { + dot[At[la,lb]] + }]; + +dissCalc = +{ + Name -> thorn <> "_Dissipation", + Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <> + "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"}, + ConditionalOnKeyword -> {"apply_dissipation", "always"}, + Where -> InteriorNoSync, + Shorthands -> {epsdiss[ua]}, + Equations -> + { + epsdiss[ua] -> EpsDiss, + Sequence@@Table[ + dot[var] -> dot[var] + epsdiss[ux] PDdiss[var,lx], + {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb], + (*alpha, A,*) beta[ua], B[ua]}}] + } +}; + +dissCalcs = +Table[ +{ + Name -> thorn <> "_Dissipation_" <> ToString[var /. 
{Tensor[n_,__] -> n}], + Schedule -> {"IN " <> thorn <> "_evolCalcGroup " <> + "AFTER (" <> thorn <> "_RHS " <> thorn <> "_RHS1 " <> thorn <> "_RHS2)"}, + ConditionalOnKeyword -> {"apply_dissipation", "always"}, + Where -> InteriorNoSync, + Shorthands -> {epsdiss[ua]}, + Equations -> + { + epsdiss[ua] -> EpsDiss, + dot[var] -> dot[var] + epsdiss[ux] PDdiss[var,lx] + } +}, + {var, {phi, gt[la,lb], Xt[ui], IfCCZ4[Theta], trK, At[la,lb], + (*alpha, A,*) beta[ua], B[ua]}} +]; + +RHSStaticBoundaryCalc = +{ + Name -> thorn <> "_RHSStaticBoundary", + Schedule -> {"IN MoL_CalcRHS"}, + ConditionalOnKeyword -> {"my_rhs_boundary_condition", "static"}, + Where -> Boundary, + Equations -> + { + dot[phi] -> 0, + dot[gt[la,lb]] -> 0, + dot[trK] -> 0, + dot[At[la,lb]] -> 0, + dot[Xt[ua]] -> 0, + (*dot[alpha] -> 0, + dot[A] -> 0,*) + dot[beta[ua]] -> 0, + dot[B[ua]] -> 0, + IfCCZ4[dot[Theta] -> 0] + } +}; + +(* Initialise the RHS variables in analysis in case they are going to + be output - the noninterior points cannot be filled, so we define + them to be zero *) +initRHSCalc = +{ + Name -> thorn <> "_InitRHS", + Schedule -> {"AT analysis BEFORE " <> thorn <> "_evolCalcGroup"}, + Where -> Everywhere, + Equations -> + { + dot[phi] -> 0, + dot[gt[la,lb]] -> 0, + dot[trK] -> 0, + dot[At[la,lb]] -> 0, + dot[Xt[ua]] -> 0, + (*dot[alpha] -> 0, + dot[A] -> 0,*) + dot[beta[ua]] -> 0, + dot[B[ua]] -> 0, + IfCCZ4[dot[Theta] -> 0] + } +}; + +RHSRadiativeBoundaryCalc = +{ + Name -> thorn <> "_RHSRadiativeBoundary", + Schedule -> {"IN MoL_CalcRHS"}, + ConditionalOnKeyword -> {"my_rhs_boundary_condition", "radiative"}, + Where -> Boundary, + Shorthands -> {dir[ua], + detgt, gtu[ua,ub], em4phi, gu[ua,ub], + nn[la], nu[ua], nlen, nlen2, su[ua], + vg}, + Equations -> + { + dir[ua] -> Sign[normal[ua]], + + detgt -> 1 (* detgtExpr *), + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + em4phi -> IfThen[conformalMethod==CMW, phi^2, Exp[-4 phi]], + gu[ua,ub] -> em4phi gtu[ua,ub], + + nn[la] 
-> Euc[la,lb] normal[ub], + nu[ua] -> gu[ua,ub] nn[lb], + nlen2 -> nu[ua] nn[la], + nlen -> Sqrt[nlen2], + su[ua] -> nu[ua] / nlen, + + vg -> Sqrt[harmonicF], + + dot[phi] -> - vg su[uc] PDo[phi ,lc], + dot[gt[la,lb]] -> - su[uc] PDo[gt[la,lb],lc], + dot[trK] -> - vg su[uc] PDo[trK ,lc], + dot[At[la,lb]] -> - su[uc] PDo[At[la,lb],lc], + dot[Xt[ua]] -> - su[uc] PDo[Xt[ua] ,lc], + (*dot[alpha] -> - vg su[uc] PDo[alpha ,lc], + dot[A] -> - vg su[uc] PDo[A ,lc],*) + dot[beta[ua]] -> - su[uc] PDo[beta[ua] ,lc], + dot[B[ua]] -> - su[uc] PDo[B[ua] ,lc], + IfCCZ4[ + dot[Theta] -> - vg su[uc] PDo[Theta ,lc] + ] + } +}; + +enforceCalc = +{ + Name -> thorn <> "_enforce", + Schedule -> {"IN MoL_PostStepModify"}, + Shorthands -> {detgt, gtu[ua,ub], trAt}, + Equations -> + { + (* The following comment is still interesting, but is not correct + any more since it is now scheduled in MoL_PostStepModify instead: + + Enforcing the constraints needs to be a projection, because it + is applied in MoL_PostStep and may thus be applied multiple + times, not only during time evolution. Therefore detgt has to + be calculated correctly, without assuming that det gt_ij = 1, + which is not always the case (since we don't enforce it). On + the other hand, this may not be so important... 
*) + detgt -> 1 (* detgtExpr *), + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + + trAt -> gtu[ua,ub] At[la,lb], + + At[la,lb] -> At[la,lb] - (1/3) gt[la,lb] trAt(*, + + alpha -> Max[alpha, MinimumLapse]*) + } +}; + +(******************************************************************************) +(* Boundary conditions *) +(******************************************************************************) + +boundaryCalc = +{ + Name -> thorn <> "_boundary", + Schedule -> {"IN MoL_PostStep"}, + ConditionalOnKeyword -> {"my_boundary_condition", "Minkowski"}, + Where -> BoundaryWithGhosts, + Equations -> + { + phi -> IfThen[conformalMethod==CMW, 1, 0], + gt[la,lb] -> KD[la,lb], + trK -> 0, + At[la,lb] -> 0, + Xt[ua] -> 0, + (*alpha -> 1, + A -> 0,*) + beta[ua] -> 0, + B[ua] -> 0, + IfCCZ4[Theta -> 0] + } +}; + +(******************************************************************************) +(* Constraint equations *) +(******************************************************************************) + +constraintsCalc = +{ + Name -> thorn <> "_constraints", + Schedule -> Automatic, + After -> "MoL_PostStep", + Where -> Interior, + Shorthands -> {detgt, ddetgt[la], gtu[ua,ub], Z[ua], + Gt[ua,lb,lc], Gtl[la,lb,lc], Gtlu[la,lb,uc], Xtn[ua], + e4phi, em4phi, + g[la,lb], detg, gu[ua,ub], ddetg[la], G[ua,lb,lc], + Rt[la,lb], Rphi[la,lb], R[la,lb], trR, Atm[ua,lb], + gK[la,lb,lc], cdphi[la], cdphi2[la,lb], + rho, S[la], fac1, fac2}, + Equations -> + { + detgt -> 1 (* detgtExpr *), + ddetgt[la] -> 0 (* ddetgtExpr[la] *), + + (* This leads to simpler code... 
*) + gtu[ua,ub] -> 1/detgt detgtExpr MatrixInverse [gt[ua,ub]], + Gtl[la,lb,lc] -> 1/2 + (PD[gt[lb,la],lc] + PD[gt[lc,la],lb] - PD[gt[lb,lc],la]), + Gtlu[la,lb,uc] -> gtu[uc,ud] Gtl[la,lb,ld], + Gt[ua,lb,lc] -> gtu[ua,ud] Gtl[ld,lb,lc], + + (* The conformal connection functions calculated from the conformal metric, + used instead of Xt where no derivatives of Xt are taken *) + Xtn[ui] -> gtu[uj,uk] Gt[ui,lj,lk], + + e4phi -> IfThen[conformalMethod==CMW, 1/phi^2, Exp[4 phi]], + em4phi -> 1 / e4phi, + g[la,lb] -> e4phi gt[la,lb], + detg -> e4phi^3, + gu[ua,ub] -> em4phi gtu[ua,ub], + + (* The Z quantities *) + IfCCZ4[ + Z[ud] -> (1/2) gu[ua,ud] (- PD[gt[la,lb],lc] gtu[ub,uc] + gt[la,lc] Xt[uc]) + ], + + (* PRD 62, 044034 (2000), eqn. (18) *) + Rt[li,lj] -> - (1/2) gtu[ul,um] PD[gt[li,lj],ll,lm] + + (1/2) gt[lk,li] PD[Xt[uk],lj] + + (1/2) gt[lk,lj] PD[Xt[uk],li] + + (1/2) Xtn[uk] Gtl[li,lj,lk] + + (1/2) Xtn[uk] Gtl[lj,li,lk] + + (+ Gt[uk,li,ll] Gtlu[lj,lk,ul] + + Gt[uk,lj,ll] Gtlu[li,lk,ul] + + Gt[uk,li,ll] Gtlu[lk,lj,ul]), + + (* From the long turducken paper. + This expression seems to give the same result as the one from 044034. *) + (* TODO: symmetrise correctly: (ij) = (1/2) [i+j] *) +(* + Rt[li,lj] -> - (1/2) gtu[uk,ul] PD[gt[li,lj],lk,ll] + + gt[lk,li] PD[Xt[uk],lj] + gt[lk,lj] PD[Xt[uk],li] + + gt[li,ln] Gt[un,lj,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm] + + gt[lj,ln] Gt[un,li,lk] gtu[um,ua] gtu[uk,ub] PD[gt[la,lb],lm] + + gtu[ul,us] (+ 2 Gt[uk,ll,li] gt[lj,ln] Gt[un,lk,ls] + + 2 Gt[uk,ll,lj] gt[li,ln] Gt[un,lk,ls] + + Gt[uk,li,ls] gt[lk,ln] Gt[un,ll,lj]), +*) + + (* Below would be a straightforward calculation, + without taking any Gamma^i into account. + This expression gives a different answer! 
*) +(* + Rt[la,lb] -> + Gt[u1,l2,la] Gt[l1,lb,u2] - Gt[u1,la,lb] Gt[l1,l2,u2] + + 1/2 gtu[u1,u2] (- PD[gt[l1,l2],la,lb] + PD[gt[l1,la],l2,lb] + - PD[gt[la,lb],l1,l2] + PD[gt[l2,lb],l1,la]), +*) + + fac1 -> IfThen[conformalMethod==CMW, -1/(2 phi), 1], + cdphi[la] -> fac1 CDt[phi,la], + fac2 -> IfThen[conformalMethod==CMW, 1/(2 phi^2), 0], + cdphi2[la,lb] -> fac1 CDt[phi,la,lb] + fac2 CDt[phi,la] CDt[phi,lb], + + (* PRD 62, 044034 (2000), eqn. (15) *) + Rphi[li,lj] -> - 2 cdphi2[lj,li] + - 2 gt[li,lj] gtu[ul,un] cdphi2[ll,ln] + + 4 cdphi[li] cdphi[lj] + - 4 gt[li,lj] gtu[ul,un] cdphi[ln] cdphi[ll], + + (* ddetg[la] -> PD[e4phi detg,la], *) + ddetg[la] -> e4phi ddetgt[la] + 4 detgt e4phi PD[phi,la], + (* TODO: check this equation, maybe simplify it by omitting ddetg *) + G[ua,lb,lc] -> Gt[ua,lb,lc] + + 1/(2 detg) (+ KD[ua,lb] ddetg[lc] + KD[ua,lc] ddetg[lb] + - (1/3) g[lb,lc] gu[ua,ud] ddetg[ld]), + + R[la,lb] -> + Rt[la,lb] + Rphi[la,lb], + + IfCCZ4[ + R[la,lb] -> R[la, lb] + (2/phi) (+ g[la,lc] Z[uc] PD[phi,lb] + + g[lb,lc] Z[uc] PD[phi,la] - g[la,lb] Z[uc] PD[phi,lc]) + + e4phi Z[uc] PD[gt[la,lb],lc] + ], + + trR -> gu[ua,ub] R[la,lb], + + (* K[la,lb] -> e4phi At[la,lb] + (1/3) g[la,lb] trK, *) + (* Km[ua,lb] -> gu[ua,uc] K[lc,lb], *) + Atm[ua,lb] -> gtu[ua,uc] At[lc,lb], + + (* Matter terms *) + + (* rho = n^a n^b T_ab *) + rho -> 1/alpha^2 (T00 - 2 beta[ui] T0[li] + beta[ui] beta[uj] T[li,lj]), + + (* S_i = -p^a_i n^b T_ab, where p^a_i = delta^a_i + n^a n_i *) + S[li] -> -1/alpha (T0[li] - beta[uj] T[li,lj]), + + (* Constraints *) + + (* H -> trR - Km[ua,lb] Km[ub,la] + trK^2, *) + (* PRD 67, 084023 (2003), eqn. 
(19) *) + H -> trR - Atm[ua,lb] Atm[ub,la] + (2/3) trK^2 - addMatter 16 Pi rho, + + (* gK[la,lb,lc] -> CD[K[la,lb],lc], *) +(* gK[la,lb,lc] -> + 4 e4phi PD[phi,lc] At[la,lb] + e4phi CD[At[la,lb],lc] + + (1/3) g[la,lb] PD[trK,lc], + + M[la] -> gu[ub,uc] (gK[lc,la,lb] - gK[lc,lb,la]), *) + + M[li] -> + gtu[uj,uk] (CDt[At[li,lj],lk] + 6 At[li,lj] cdphi[lk]) + - (2/3) PD[trK,li] + - addMatter 8 Pi S[li], + (* TODO: use PRD 67, 084023 (2003), eqn. (20) *) + + (* det gamma-tilde *) + cS -> Log[detgt], + + (* Gamma constraint *) + cXt[ua] -> gtu[ub,uc] Gt[ua,lb,lc] - Xt[ua], + + (* trace A-tilde *) + cA -> gtu[ua,ub] At[la,lb] + } +}; + +constraintsCalc1 = PartialCalculation[constraintsCalc, "1", + {}, + { + H + }]; + +constraintsCalc2 = PartialCalculation[constraintsCalc, "2", + {}, + { + M[li], + cS, + cXt[ua], + cA + }]; + +(******************************************************************************) +(* Implementations *) +(******************************************************************************) + +inheritedImplementations = + Join[{"ADMBase", "QuasiMaximalSlicing"}, + If [addMatter!=0, {"TmunuBase"}, {}]]; + +(******************************************************************************) +(* Parameters *) +(******************************************************************************) + +inheritedKeywordParameters = {}; + +extendedKeywordParameters = +{ + { + Name -> "ADMBase::evolution_method", + AllowedValues -> {thorn} + }, + { + Name -> "ADMBase::lapse_evolution_method", + AllowedValues -> {thorn} + }, + { + Name -> "ADMBase::shift_evolution_method", + AllowedValues -> {thorn} + }, + { + Name -> "ADMBase::dtlapse_evolution_method", + AllowedValues -> {thorn} + }, + { + Name -> "ADMBase::dtshift_evolution_method", + AllowedValues -> {thorn} + } +}; + +keywordParameters = +{ + { + Name -> "my_initial_data", + (* Visibility -> "restricted", *) + (* Description -> "ddd", *) + AllowedValues -> {"ADMBase", "Minkowski"}, + Default -> "ADMBase" + }, + { + 
Name -> "my_initial_boundary_condition", + Visibility -> "restricted", + (* Description -> "ddd", *) + AllowedValues -> {"none"}, + Default -> "none" + }, + { + Name -> "my_rhs_boundary_condition", + Visibility -> "restricted", + (* Description -> "ddd", *) + AllowedValues -> {"none", "static", "radiative"}, + Default -> "none" + }, + { + Name -> "my_boundary_condition", + (* Visibility -> "restricted", *) + (* Description -> "ddd", *) + AllowedValues -> {"none", "Minkowski"}, + Default -> "none" + }, + { + Name -> "calculate_ADMBase_variables_at", + Visibility -> "restricted", + (* Description -> "ddd", *) + AllowedValues -> {"MoL_PostStep", "CCTK_EVOL", "CCTK_ANALYSIS"}, + Default -> "MoL_PostStep" + }, + { + Name -> "UseSpatialBetaDriver", + Visibility -> "restricted", + (* Description -> "ddd", *) + AllowedValues -> {"no", "yes"}, + Default -> "no" + }, + { + Name -> "dt_lapse_shift_method", + Description -> "Treatment of ADMBase dtlapse and dtshift", + AllowedValues -> {"correct", + "noLapseShiftAdvection" (* omit lapse and shift advection terms (faster) *) + }, + Default -> "correct" + }, + { + Name -> "apply_dissipation", + Description -> "Whether to apply dissipation to the RHSs", + AllowedValues -> {"always", + "never" (* yes and no keyword values confuse Cactus, and Kranc + doesn't support boolean parameters *) + }, + Default -> "never" + } + +}; + +intParameters = +{ + { + Name -> harmonicN, + Description -> "d/dt alpha = - f alpha^n K (harmonic=2, 1+log=1)", + Default -> 2 + }, + { + Name -> ShiftAlphaPower, + Default -> 0 + }, + { + Name -> conformalMethod, + Description -> "Treatment of conformal factor", + AllowedValues -> {{Value -> "0", Description -> "phi method"}, + {Value -> "1", Description -> "W method"}}, + Default -> 0 + }, + { + Name -> fdOrder, + Default -> derivOrder, + AllowedValues -> {2,4,6,8} + }, + { + Name -> harmonicShift, + Description -> "Whether to use the harmonic shift", + AllowedValues -> {{Value -> "0", Description -> "Gamma 
driver shift"}, + {Value -> "1", Description -> "Harmonic shift"}}, + Default -> 0 + } +}; + +realParameters = +{ + IfCCZ4[{ + Name -> GammaShift, + Description -> "Covariant shift term in Gamma", + Default -> 0.5 + }], + IfCCZ4[{ + Name -> dampk1, + Description -> "CCZ4 damping term 1 for Theta and Z", + Default -> 0 + }], + IfCCZ4[{ + Name -> dampk2, + Description -> "CCZ4 damping term 2 for Theta and Z", + Default -> 0 + }], + { + Name -> LapseACoeff, + Description -> "Whether to evolve A in time", + Default -> 0 + }, + { + Name -> harmonicF, + Description -> "d/dt alpha = - f alpha^n K (harmonic=1, 1+log=2)", + Default -> 1 + }, + { + Name -> AlphaDriver, + Default -> 0 + }, + { + Name -> RDriver, + Default -> 1 + }, + { + Name -> ShiftBCoeff, + Description -> "Whether to evolve B^i in time", + Default -> 1 + }, + { + Name -> ShiftGammaCoeff, + Default -> 0 + }, + { + Name -> BetaDriver, + Default -> 0 + }, + { + Name -> WFactor, + Default -> 1 + }, + { + Name -> LapseAdvectionCoeff, + Description -> "Factor in front of the lapse advection terms in 1+log", + Default -> 1 + }, + { + Name -> ShiftAdvectionCoeff, + Description -> "Factor in front of the shift advection terms in gamma driver", + Default -> 1 + }, + { + Name -> MinimumLapse, + Description -> "Minimum value of the lapse function", + Default -> -1 + }, + { + Name -> SpatialBetaDriverRadius, + Description -> "Radius at which the BetaDriver starts to be reduced", + AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}}, + Default -> 10^12 + }, + { + Name -> SpatialShiftGammaCoeffRadius, + Description -> "Radius at which the ShiftGammaCoefficient starts to be reduced", + AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}}, + Default -> 10^12 + }, + { + Name -> EpsDiss, + Description -> "Dissipation strength", + AllowedValues -> {{Value -> "(0:*", Description -> "Positive"}}, + Default -> 0 + }, + { + Name -> KEvolFactor, + Description -> "", + AllowedValues -> {{Value -> "(0:*", 
Description -> "Positive"}}, + Default -> 0 + } +}; + +(******************************************************************************) +(* Construct the thorns *) +(******************************************************************************) + +calculations = +Join[ +{ + initialCalc, + convertFromADMBaseCalc, + initGammaCalc, + convertFromADMBaseGammaCalc, + evolCalc, + (*evolCalc1, evolCalc2,*) + lapseEvolCalc, + dissCalc, + advectCalc, + initRHSCalc, + (* evol1Calc, evol2Calc, *) + RHSStaticBoundaryCalc, + (* RHSRadiativeBoundaryCalc, *) + enforceCalc, + boundaryCalc, + convertToADMBaseCalc, + convertToADMBaseDtLapseShiftCalc, + convertToADMBaseDtLapseShiftBoundaryCalc, + convertToADMBaseFakeDtLapseShiftCalc, + constraintsCalc + (*constraintsCalc1, constraintsCalc2*) +}, + {} (*dissCalcs*) +]; + +CreateKrancThornTT [groups, ".", thorn, + Calculations -> calculations, + DeclaredGroups -> declaredGroupNames, + PartialDerivatives -> derivatives, + EvolutionTimelevels -> evolutionTimelevels, + DefaultEvolutionTimelevels -> 3, + UseJacobian -> True, + UseLoopControl -> True, + UseVectors -> True, + InheritedImplementations -> inheritedImplementations, + InheritedKeywordParameters -> inheritedKeywordParameters, + ExtendedKeywordParameters -> extendedKeywordParameters, + KeywordParameters -> keywordParameters, + IntParameters -> intParameters, + RealParameters -> realParameters +]; + +]; + + + +(******************************************************************************) +(* Options *) +(******************************************************************************) + +(* These are the arguments to createCode: + - derivative order: 2, 4, 6, 8, ... 
+ - useJacobian: False or True + - split upwind derivatives: False or True + - timelevels: 2 or 3 + (keep this at 3; this is better chosen with a run-time parameter) + - matter: 0 or 1 + (matter seems cheap; it should be always enabled) + - thorn base name +*) + +createCode[4, False, True , 3, 1, "MD"]; diff --git a/src/basis.c b/src/basis.c new file mode 100644 index 0000000..8e5bdcc --- /dev/null +++ b/src/basis.c @@ -0,0 +1,281 @@ +/* + * Basis sets for pseudospectral methods + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <math.h> + +#include "basis.h" +#include "common.h" + +typedef struct BasisSet { + /* evaluate the idx-th basis function at the specified point*/ + double (*eval) (const MDBasisSetContext *s, double coord, unsigned int idx); + /* evaluate the first derivative of the idx-th basis function at the specified point*/ + double (*eval_diff1)(const MDBasisSetContext *s, double coord, unsigned int idx); + /* evaluate the second derivative of the idx-th basis function at the specified point*/ + double (*eval_diff2)(const MDBasisSetContext *s, double coord, unsigned int idx); + /** + * Get the idx-th collocation point for the specified order. 
+ * idx runs from 0 to order - 1 (inclusive) + */ + double (*colloc_point)(const MDBasisSetContext *s, unsigned int order, unsigned int idx); +} BasisSet; + +struct MDBasisSetContext { + const BasisSet *bs; + double sf; +}; + +/* + * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) + * SB(x, n) = sin((n + 1) arccot(|x| / L)) + * They are symmetric wrt origin and decay as 1/x in infinity. + */ +static double sb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx *= 2; // even only + + return sin((idx + 1) * val); +} + +static double sb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx *= 2; // even only + + return -s->sf * (idx + 1) * cos((idx + 1) * val) / (SQR(s->sf) + SQR(coord)); +} + +static double sb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = atan2(sf, coord); + + idx *= 2; // even only + + return sf * (idx + 1) * (2 * coord * cos((idx + 1) * val) - sf * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(sf) + SQR(coord)); +} + +static double sb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); +#if MD_POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else + t = (idx + 2) * M_PI / (2 * order + 2); +#endif + return s->sf / tan(t); +} + +static const BasisSet sb_even_basis = { + .eval = sb_even_eval, + .eval_diff1 = sb_even_eval_diff1, + .eval_diff2 = sb_even_eval_diff2, + .colloc_point = sb_even_colloc_point, +}; + +static double sb_odd_eval(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx = 2 * idx + 2; // odd only + + return sin((idx) * val); +} + +static double sb_odd_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + 
double val = atan2(s->sf, coord); + + idx = 2 * idx + 2; // odd only + + return -s->sf * (idx) * cos((idx) * val) / (SQR(s->sf) + SQR(coord)); +} + +static double sb_odd_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = atan2(sf, coord); + + idx = 2 * idx + 2; // odd only + + return sf * (idx) * (2 * coord * cos((idx) * val) - sf * (idx) * sin((idx) * val)) / SQR(SQR(sf) + SQR(coord)); +} + +static double sb_odd_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); +#if MD_POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else + t = (idx + 2) * M_PI / (2 * order + 3); +#endif + return s->sf / tan(t); +} + +static const BasisSet sb_odd_basis = { + .eval = sb_odd_eval, + .eval_diff1 = sb_odd_eval_diff1, + .eval_diff2 = sb_odd_eval_diff2, + .colloc_point = sb_odd_colloc_point, +}; + +static double tb_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(s->sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return cos(idx * val) - 1.0; +} + +static double tb_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(s->sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return s->sf * idx * SGN(coord) * sin(idx * val) / (SQR(s->sf) + SQR(coord)); +} + +static double tb_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = (coord == 0.0) ? 
M_PI_2 : atan(sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return -sf * idx * SGN(coord) * (2 * fabs(coord) * sin(idx * val) + sf * idx * cos(idx * val)) / SQR(SQR(sf) + SQR(coord)); +} + +static double tb_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); + t = (idx + 2) * M_PI / (2 * order + 4); + return s->sf / tan(t); +} + +static const BasisSet tb_even_basis = { + .eval = tb_even_eval, + .eval_diff1 = tb_even_eval_diff1, + .eval_diff2 = tb_even_eval_diff2, + .colloc_point = tb_even_colloc_point, +}; + +static double cos_even_eval(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + return cos(2 * idx * coord); +} + +/* + * First derivative of cos(2 * idx * coord) with respect to coord. + * The prefactor is evaluated in double: idx is unsigned, so the integer + * expression -2 * idx would wrap around to a huge positive value. + */ +static double cos_even_eval_diff1(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + return -2.0 * idx * sin(2 * idx * coord); +} + +/* + * Second derivative of cos(2 * idx * coord) with respect to coord. + * As above, the negative prefactor must not be formed in unsigned arithmetic. + */ +static double cos_even_eval_diff2(const MDBasisSetContext *s, double coord, unsigned int idx) +{ + return -4.0 * SQR((double)idx) * cos(2 * idx * coord); +} + +static double cos_even_colloc_point(const MDBasisSetContext *s, unsigned int order, unsigned int idx) +{ + return M_PI * idx / (2 * order - 0); +} + +static const BasisSet cos_even_basis = { + .eval = cos_even_eval, + .eval_diff1 = cos_even_eval_diff1, + .eval_diff2 = cos_even_eval_diff2, + .colloc_point = cos_even_colloc_point, +}; + +/* + * Evaluate the order-th basis function of the set s, or its first/second + * derivative as selected by type, at the point coord. + * Returns NAN when type is not one of the MDBasisEvalType values. + */ +double md_basis_eval(const MDBasisSetContext *s, enum MDBasisEvalType type, + double coord, unsigned int order) +{ + double (*eval)(const MDBasisSetContext *, double, unsigned int) = NULL; + + switch (type) { + case MD_BASIS_EVAL_TYPE_VALUE: eval = s->bs->eval; break; + case MD_BASIS_EVAL_TYPE_DIFF1: eval = s->bs->eval_diff1; break; + case MD_BASIS_EVAL_TYPE_DIFF2: eval = s->bs->eval_diff2; break; + } + + /* an unknown type would otherwise call through a NULL pointer */ + if (!eval) + return NAN; + + return eval(s, coord, order); +} + +double md_basis_colloc_point(const MDBasisSetContext *s, unsigned int order, + unsigned int idx) +{ + return s->bs->colloc_point(s, 
order, idx); +} + +void md_basis_free(MDBasisSetContext **pctx) +{ + MDBasisSetContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx); + *pctx = NULL; +} + +int md_basis_init(MDBasisSetContext **pctx, enum MDBasisFamily family, double sf) +{ + MDBasisSetContext *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + switch (family) { + case MD_BASIS_FAMILY_TB_EVEN: ctx->bs = &tb_even_basis; break; + case MD_BASIS_FAMILY_SB_EVEN: ctx->bs = &sb_even_basis; break; + case MD_BASIS_FAMILY_SB_ODD: ctx->bs = &sb_odd_basis; break; + case MD_BASIS_FAMILY_COS_EVEN: ctx->bs = &cos_even_basis; break; + default: + free(ctx); + return -EINVAL; + } + + ctx->sf = sf; + + *pctx = ctx; + return 0; +} diff --git a/src/basis.h b/src/basis.h new file mode 100644 index 0000000..08f23ee --- /dev/null +++ b/src/basis.h @@ -0,0 +1,45 @@ +/* + * Basis sets for pseudospectral methods + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef MD_BASIS_H +#define MD_BASIS_H + +enum MDBasisEvalType { + MD_BASIS_EVAL_TYPE_VALUE, + MD_BASIS_EVAL_TYPE_DIFF1, + MD_BASIS_EVAL_TYPE_DIFF2, +}; + +enum MDBasisFamily { + MD_BASIS_FAMILY_TB_EVEN, + MD_BASIS_FAMILY_SB_EVEN, + MD_BASIS_FAMILY_SB_ODD, + MD_BASIS_FAMILY_COS_EVEN, +}; + +typedef struct MDBasisSetContext MDBasisSetContext; + +int md_basis_init(MDBasisSetContext **ctx, enum MDBasisFamily family, double sf); +void md_basis_free(MDBasisSetContext **ctx); + +double md_basis_eval(const MDBasisSetContext *ctx, enum MDBasisEvalType type, + double coord, unsigned int order); +double md_basis_colloc_point(const MDBasisSetContext *ctx, unsigned int order, + unsigned int idx); + +#endif /* MD_BASIS_H */ diff --git a/src/bicgstab.c b/src/bicgstab.c new file mode 100644 index 0000000..7e82183 --- /dev/null +++ b/src/bicgstab.c @@ -0,0 +1,410 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" + +#if HAVE_OPENCL +#include <cl.h> +#include <clBLAS.h> +#endif + +#include <cblas.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "bicgstab.h" + +#define BICGSTAB_MAXITER 16 +#define BICGSTAB_TOL (1e-15) + +struct BiCGStabContext { + int N; + + double *x; + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + +#if HAVE_OPENCL + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + cl_mem cl_x; + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; +#endif +}; + +#if HAVE_OPENCL +/* + * OpenCL/clBLAS variant of the preconditioned BiCGStab iteration. + * Solves mat * x = rhs starting from the guess stored in ctx->cl_x and using + * ctx->cl_k as the preconditioner, mirroring solve_sw() step by step. + * Returns the number of iterations used, or -1 if the relative residual did + * not drop below BICGSTAB_TOL within BICGSTAB_MAXITER iterations. + */ +static int solve_cl(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + cl_command_queue ocl_q = ctx->ocl_queue; + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + // upload the matrix and RHS + clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &ocl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + + /* same iteration bound as the post-loop convergence check below */ + for (i = 0; i < BICGSTAB_MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double beta = (rho / 
rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + 1, 
&ocl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &ocl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + /* test the residual relative to |rhs|, as solve_sw() does */ + err /= rhs_norm; + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N, + x, 0, NULL, NULL); + return i; +} +#endif + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_sw(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, ctx->x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + + for (i = 0; i < BICGSTAB_MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + 
cblas_daxpy(N, alpha, y, 1, ctx->x, 1); + cblas_daxpy(N, omega, z, 1, ctx->x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + memcpy(x, ctx->x, sizeof(*x) * ctx->N); + + return i; +} + +int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) +{ + int ret; + +#if HAVE_OPENCL + if (ctx->ocl_ctx) + ret = solve_cl(ctx, mat, rhs, x); + else +#endif + ret = solve_sw(ctx, mat, rhs, x); + if (ret < 0) + return ret; + +#if MD_VERIFY + { + int i; + double *y; + + y = malloc(sizeof(*y) * ctx->N); + memcpy(y, rhs, sizeof(*y) * ctx->N); + cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0, + mat, ctx->N, x, 1, 1.0, y, 1); + i = cblas_idamax(ctx->N, y, 1); + if (fabs(y[i]) > 1e-11) + abort(); + } +#endif + + return ret; +} + +int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0) +{ +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + cl_event events[2]; + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double), + k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double), + x0, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } else +#endif + { + memcpy(ctx->x, x0, ctx->N * sizeof(*x0)); + memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k)); + } + + return 0; +} + +int md_bicgstab_context_alloc(BiCGStabContext **pctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q) +{ + BiCGStabContext *ctx; + int ret = 0; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->N = N; + +#if HAVE_OPENCL + if (ocl_ctx) { + ctx->ocl_ctx = ocl_ctx; + ctx->ocl_queue = ocl_q; + +#define ALLOC(dst, size) \ +do { \ + ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \ + if (ret != CL_SUCCESS) \ + goto fail; \ +} while (0) + + ALLOC(cl_x, N * sizeof(double)); + ALLOC(cl_p, N * sizeof(double)); 
+ ALLOC(cl_v, N * sizeof(double)); + ALLOC(cl_y, N * sizeof(double)); + ALLOC(cl_z, N * sizeof(double)); + ALLOC(cl_t, N * sizeof(double)); + ALLOC(cl_res, N * sizeof(double)); + ALLOC(cl_res0, N * sizeof(double)); + ALLOC(cl_tmp, N * sizeof(double)); + ALLOC(cl_tmp1, N * 2 * sizeof(double)); + + ALLOC(cl_k, N * N * sizeof(double)); + ALLOC(cl_mat, N * N * sizeof(double)); + + ALLOC(cl_rho, sizeof(double)); + ALLOC(cl_alpha, sizeof(double)); + ALLOC(cl_beta, sizeof(double)); + ALLOC(cl_omega, 2 * sizeof(double)); + ALLOC(cl_omega1, sizeof(double)); + } else +#endif + { + ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N); + } + +fail: + if (ret) { + md_bicgstab_context_free(&ctx); + return -ENOMEM; + } + + *pctx = ctx; + return 0; +} + +void md_bicgstab_context_free(BiCGStabContext **pctx) +{ + BiCGStabContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx->x); + free(ctx->p); + free(ctx->v); + free(ctx->y); + free(ctx->z); + free(ctx->t); + free(ctx->res); + free(ctx->res0); + free(ctx->k); + +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + clReleaseMemObject(ctx->cl_x); + clReleaseMemObject(ctx->cl_p); + clReleaseMemObject(ctx->cl_v); + clReleaseMemObject(ctx->cl_y); + clReleaseMemObject(ctx->cl_z); + clReleaseMemObject(ctx->cl_t); + clReleaseMemObject(ctx->cl_res); + clReleaseMemObject(ctx->cl_res0); + clReleaseMemObject(ctx->cl_tmp); + clReleaseMemObject(ctx->cl_tmp1); + + clReleaseMemObject(ctx->cl_k); + 
clReleaseMemObject(ctx->cl_mat); + + clReleaseMemObject(ctx->cl_rho); + clReleaseMemObject(ctx->cl_alpha); + clReleaseMemObject(ctx->cl_beta); + clReleaseMemObject(ctx->cl_omega); + clReleaseMemObject(ctx->cl_omega1); + } +#endif + + free(ctx); + *pctx = NULL; +} diff --git a/src/bicgstab.h b/src/bicgstab.h new file mode 100644 index 0000000..70624f4 --- /dev/null +++ b/src/bicgstab.h @@ -0,0 +1,60 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef MD_BICGSTAB_H +#define MD_BICGSTAB_H + +#include "common.h" + +#if HAVE_OPENCL +#include <cl.h> +#else +typedef void* cl_context; +typedef void* cl_command_queue; +#endif + +typedef struct BiCGStabContext BiCGStabContext; + +/** + * Allocate and initialize the solver for the NxN system. + * + * If the OpenCL context and command queue are provided (non-NULL), the solver + * will run using clBLAS. + */ +int md_bicgstab_context_alloc(BiCGStabContext **ctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q); + +/** + * Free the solver and all its internal state. + */ +void md_bicgstab_context_free(BiCGStabContext **ctx); + +/** + * Initialise the solver with the given preconditioner matrix. This function + * may be called any number of times on a given solver context. 
+ */ +int md_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0); + +/** + * Solve the linear system + * mat · x = rhs + * The result is written into x. + */ +int md_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x); + +#endif /* MD_BICGSTAB_H */ diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..2b1ebf6 --- /dev/null +++ b/src/common.h @@ -0,0 +1,29 @@ +#ifndef MD_COMMON_H +#define MD_COMMON_H + +#define HAVE_OPENCL 0 +#define MD_VERIFY 0 +#define MD_POLAR 0 + +#define SQR(x) ((x) * (x)) +#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) > (y) ? (y) : (x)) +#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) + +/* + * small number to avoid r=0 singularities + */ +#define EPS 1E-08 + +#include <stdlib.h> +#include <stdint.h> +#include <sys/time.h> +static inline int64_t gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + +#endif /* MD_COMMON_H */ diff --git a/src/config.asm b/src/config.asm new file mode 100644 index 0000000..0ee0ca2 --- /dev/null +++ b/src/config.asm @@ -0,0 +1,1325 @@ +%define ARCH_AARCH64 0 +%define ARCH_ALPHA 0 +%define ARCH_ARM 0 +%define ARCH_AVR32 0 +%define ARCH_AVR32_AP 0 +%define ARCH_AVR32_UC 0 +%define ARCH_BFIN 0 +%define ARCH_IA64 0 +%define ARCH_M68K 0 +%define ARCH_MIPS 0 +%define ARCH_MIPS64 0 +%define ARCH_PARISC 0 +%define ARCH_PPC 0 +%define ARCH_PPC64 0 +%define ARCH_S390 0 +%define ARCH_SH4 0 +%define ARCH_SPARC 0 +%define ARCH_SPARC64 0 +%define ARCH_TILEGX 0 +%define ARCH_TILEPRO 0 +%define ARCH_TOMI 0 +%define ARCH_X86 1 +%define ARCH_X86_32 0 +%define ARCH_X86_64 1 +%define HAVE_ARMV5TE 0 +%define HAVE_ARMV6 0 +%define HAVE_ARMV6T2 0 +%define HAVE_ARMV8 0 +%define HAVE_NEON 0 +%define HAVE_VFP 0 +%define HAVE_VFPV3 0 +%define HAVE_ALTIVEC 0 +%define HAVE_DCBZL 1 +%define HAVE_LDBRX 1 +%define HAVE_PPC4XX 0 +%define 
HAVE_AMD3DNOW 1 +%define HAVE_AMD3DNOWEXT 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_FMA3 1 +%define HAVE_FMA4 1 +%define HAVE_MMX 1 +%define HAVE_MMXEXT 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSE4 1 +%define HAVE_SSE42 1 +%define HAVE_SSSE3 1 +%define HAVE_XOP 1 +%define HAVE_CPUNOP 1 +%define HAVE_I686 1 +%define HAVE_LOONGSON 1 +%define HAVE_VIS 1 +%define HAVE_ARMV5TE_EXTERNAL 0 +%define HAVE_ARMV6_EXTERNAL 0 +%define HAVE_ARMV6T2_EXTERNAL 0 +%define HAVE_ARMV8_EXTERNAL 0 +%define HAVE_NEON_EXTERNAL 0 +%define HAVE_VFP_EXTERNAL 0 +%define HAVE_VFPV3_EXTERNAL 0 +%define HAVE_ALTIVEC_EXTERNAL 0 +%define HAVE_DCBZL_EXTERNAL 0 +%define HAVE_LDBRX_EXTERNAL 0 +%define HAVE_PPC4XX_EXTERNAL 0 +%define HAVE_AMD3DNOW_EXTERNAL 1 +%define HAVE_AMD3DNOWEXT_EXTERNAL 1 +%define HAVE_AVX_EXTERNAL 1 +%define HAVE_AVX2_EXTERNAL 1 +%define HAVE_FMA3_EXTERNAL 1 +%define HAVE_FMA4_EXTERNAL 1 +%define HAVE_MMX_EXTERNAL 1 +%define HAVE_MMXEXT_EXTERNAL 1 +%define HAVE_SSE_EXTERNAL 1 +%define HAVE_SSE2_EXTERNAL 1 +%define HAVE_SSE3_EXTERNAL 1 +%define HAVE_SSE4_EXTERNAL 1 +%define HAVE_SSE42_EXTERNAL 1 +%define HAVE_SSSE3_EXTERNAL 1 +%define HAVE_XOP_EXTERNAL 1 +%define HAVE_CPUNOP_EXTERNAL 0 +%define HAVE_I686_EXTERNAL 0 +%define HAVE_LOONGSON_EXTERNAL 0 +%define HAVE_VIS_EXTERNAL 0 +%define HAVE_ARMV5TE_INLINE 0 +%define HAVE_ARMV6_INLINE 0 +%define HAVE_ARMV6T2_INLINE 0 +%define HAVE_ARMV8_INLINE 0 +%define HAVE_NEON_INLINE 0 +%define HAVE_VFP_INLINE 0 +%define HAVE_VFPV3_INLINE 0 +%define HAVE_ALTIVEC_INLINE 0 +%define HAVE_DCBZL_INLINE 0 +%define HAVE_LDBRX_INLINE 0 +%define HAVE_PPC4XX_INLINE 0 +%define HAVE_AMD3DNOW_INLINE 1 +%define HAVE_AMD3DNOWEXT_INLINE 1 +%define HAVE_AVX_INLINE 1 +%define HAVE_AVX2_INLINE 1 +%define HAVE_FMA3_INLINE 1 +%define HAVE_FMA4_INLINE 1 +%define HAVE_MMX_INLINE 1 +%define HAVE_MMXEXT_INLINE 1 +%define HAVE_SSE_INLINE 1 +%define HAVE_SSE2_INLINE 1 +%define HAVE_SSE3_INLINE 1 +%define 
HAVE_SSE4_INLINE 1 +%define HAVE_SSE42_INLINE 1 +%define HAVE_SSSE3_INLINE 1 +%define HAVE_XOP_INLINE 1 +%define HAVE_CPUNOP_INLINE 0 +%define HAVE_I686_INLINE 0 +%define HAVE_LOONGSON_INLINE 0 +%define HAVE_VIS_INLINE 0 +%define HAVE_ALIGNED_STACK 1 +%define HAVE_FAST_64BIT 1 +%define HAVE_FAST_CLZ 1 +%define HAVE_FAST_CMOV 1 +%define HAVE_LOCAL_ALIGNED_8 1 +%define HAVE_LOCAL_ALIGNED_16 1 +%define HAVE_SIMD_ALIGN_16 1 +%define HAVE_ATOMICS_GCC 1 +%define HAVE_ATOMICS_SUNCC 0 +%define HAVE_ATOMICS_WIN32 0 +%define HAVE_ATOMIC_CAS_PTR 0 +%define HAVE_MACHINE_RW_BARRIER 0 +%define HAVE_MEMORYBARRIER 0 +%define HAVE_MM_EMPTY 1 +%define HAVE_RDTSC 0 +%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1 +%define HAVE_INLINE_ASM 1 +%define HAVE_SYMVER 1 +%define HAVE_YASM 1 +%define HAVE_BIGENDIAN 0 +%define HAVE_FAST_UNALIGNED 1 +%define HAVE_ALSA_ASOUNDLIB_H 1 +%define HAVE_ALTIVEC_H 0 +%define HAVE_ARPA_INET_H 1 +%define HAVE_CDIO_PARANOIA_H 0 +%define HAVE_CDIO_PARANOIA_PARANOIA_H 0 +%define HAVE_DEV_BKTR_IOCTL_BT848_H 0 +%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0 +%define HAVE_DEV_IC_BT8XX_H 0 +%define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0 +%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0 +%define HAVE_DIRECT_H 0 +%define HAVE_DLFCN_H 1 +%define HAVE_DXVA_H 0 +%define HAVE_GSM_H 0 +%define HAVE_IO_H 0 +%define HAVE_MACH_MACH_TIME_H 0 +%define HAVE_MACHINE_IOCTL_BT848_H 0 +%define HAVE_MACHINE_IOCTL_METEOR_H 0 +%define HAVE_MALLOC_H 1 +%define HAVE_POLL_H 1 +%define HAVE_SNDIO_H 0 +%define HAVE_SOUNDCARD_H 0 +%define HAVE_SYS_MMAN_H 1 +%define HAVE_SYS_PARAM_H 1 +%define HAVE_SYS_RESOURCE_H 1 +%define HAVE_SYS_SELECT_H 1 +%define HAVE_SYS_SOUNDCARD_H 1 +%define HAVE_SYS_TIME_H 1 +%define HAVE_SYS_UN_H 1 +%define HAVE_SYS_VIDEOIO_H 0 +%define HAVE_UNISTD_H 1 +%define HAVE_WINDOWS_H 0 +%define HAVE_WINSOCK2_H 0 +%define HAVE_INTRINSICS_NEON 0 +%define HAVE_ATANF 1 +%define HAVE_ATAN2F 1 +%define HAVE_CBRTF 1 +%define HAVE_COSF 1 +%define HAVE_EXP2 1 +%define HAVE_EXP2F 1 +%define 
HAVE_EXPF 1 +%define HAVE_ISINF 1 +%define HAVE_ISNAN 1 +%define HAVE_LDEXPF 1 +%define HAVE_LLRINT 1 +%define HAVE_LLRINTF 1 +%define HAVE_LOG2 1 +%define HAVE_LOG2F 1 +%define HAVE_LOG10F 1 +%define HAVE_LRINT 1 +%define HAVE_LRINTF 1 +%define HAVE_POWF 1 +%define HAVE_RINT 1 +%define HAVE_ROUND 1 +%define HAVE_ROUNDF 1 +%define HAVE_SINF 1 +%define HAVE_TRUNC 1 +%define HAVE_TRUNCF 1 +%define HAVE_ALIGNED_MALLOC 0 +%define HAVE_CLOSESOCKET 0 +%define HAVE_COMMANDLINETOARGVW 0 +%define HAVE_COTASKMEMFREE 0 +%define HAVE_CRYPTGENRANDOM 0 +%define HAVE_DLOPEN 1 +%define HAVE_FCNTL 1 +%define HAVE_FLT_LIM 1 +%define HAVE_FORK 1 +%define HAVE_GETADDRINFO 1 +%define HAVE_GETHRTIME 0 +%define HAVE_GETOPT 1 +%define HAVE_GETPROCESSAFFINITYMASK 0 +%define HAVE_GETPROCESSMEMORYINFO 0 +%define HAVE_GETPROCESSTIMES 0 +%define HAVE_GETRUSAGE 1 +%define HAVE_GETSERVBYPORT 1 +%define HAVE_GETSYSTEMTIMEASFILETIME 0 +%define HAVE_GETTIMEOFDAY 1 +%define HAVE_INET_ATON 1 +%define HAVE_ISATTY 1 +%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0 +%define HAVE_LOCALTIME_R 1 +%define HAVE_MACH_ABSOLUTE_TIME 0 +%define HAVE_MAPVIEWOFFILE 0 +%define HAVE_MEMALIGN 1 +%define HAVE_MKSTEMP 1 +%define HAVE_MMAP 1 +%define HAVE_MPROTECT 1 +%define HAVE_NANOSLEEP 1 +%define HAVE_POSIX_MEMALIGN 1 +%define HAVE_SCHED_GETAFFINITY 1 +%define HAVE_SETCONSOLETEXTATTRIBUTE 0 +%define HAVE_SETMODE 0 +%define HAVE_SETRLIMIT 1 +%define HAVE_SLEEP 0 +%define HAVE_STRERROR_R 1 +%define HAVE_STRPTIME 1 +%define HAVE_SYSCONF 1 +%define HAVE_SYSCTL 1 +%define HAVE_USLEEP 1 +%define HAVE_VIRTUALALLOC 0 +%define HAVE_PTHREADS 1 +%define HAVE_W32THREADS 0 +%define HAVE_AS_DN_DIRECTIVE 0 +%define HAVE_AS_FUNC 1 +%define HAVE_ASM_MOD_Q 0 +%define HAVE_ATTRIBUTE_MAY_ALIAS 1 +%define HAVE_ATTRIBUTE_PACKED 1 +%define HAVE_EBP_AVAILABLE 0 +%define HAVE_EBX_AVAILABLE 1 +%define HAVE_GNU_AS 1 +%define HAVE_IBM_ASM 0 +%define HAVE_INLINE_ASM_LABELS 1 +%define HAVE_PRAGMA_DEPRECATED 1 +%define HAVE_SYMVER_ASM_LABEL 0 +%define 
HAVE_SYMVER_GNU_ASM 1 +%define HAVE_VFP_ARGS 0 +%define HAVE_XFORM_ASM 0 +%define HAVE_XMM_CLOBBERS 1 +%define HAVE_SOCKLEN_T 1 +%define HAVE_STRUCT_ADDRINFO 1 +%define HAVE_STRUCT_GROUP_SOURCE_REQ 1 +%define HAVE_STRUCT_IP_MREQ_SOURCE 1 +%define HAVE_STRUCT_IPV6_MREQ 1 +%define HAVE_STRUCT_POLLFD 1 +%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1 +%define HAVE_STRUCT_SOCKADDR_IN6 1 +%define HAVE_STRUCT_SOCKADDR_SA_LEN 0 +%define HAVE_STRUCT_SOCKADDR_STORAGE 1 +%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1 +%define HAVE_ATOMICS_NATIVE 1 +%define HAVE_DOS_PATHS 0 +%define HAVE_DXVA2_LIB 0 +%define HAVE_LIBC_MSVCRT 0 +%define HAVE_LIBDC1394_1 0 +%define HAVE_LIBDC1394_2 0 +%define HAVE_SDL 0 +%define HAVE_THREADS 1 +%define HAVE_VDPAU_X11 0 +%define HAVE_XLIB 1 +%define CONFIG_BSFS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_DEMUXERS 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_FILTERS 1 +%define CONFIG_HWACCELS 0 +%define CONFIG_INDEVS 1 +%define CONFIG_MUXERS 1 +%define CONFIG_OUTDEVS 1 +%define CONFIG_PARSERS 1 +%define CONFIG_PROTOCOLS 1 +%define CONFIG_AVCODEC_EXAMPLE 1 +%define CONFIG_FILTER_AUDIO_EXAMPLE 1 +%define CONFIG_METADATA_EXAMPLE 1 +%define CONFIG_OUTPUT_EXAMPLE 1 +%define CONFIG_TRANSCODE_AAC_EXAMPLE 1 +%define CONFIG_AVISYNTH 0 +%define CONFIG_BZLIB 1 +%define CONFIG_FREI0R 0 +%define CONFIG_GNUTLS 0 +%define CONFIG_LIBBS2B 0 +%define CONFIG_LIBCDIO 0 +%define CONFIG_LIBDC1394 0 +%define CONFIG_LIBFAAC 0 +%define CONFIG_LIBFDK_AAC 0 +%define CONFIG_LIBFONTCONFIG 0 +%define CONFIG_LIBFREETYPE 0 +%define CONFIG_LIBGSM 0 +%define CONFIG_LIBILBC 0 +%define CONFIG_LIBMP3LAME 0 +%define CONFIG_LIBOPENCORE_AMRNB 0 +%define CONFIG_LIBOPENCORE_AMRWB 0 +%define CONFIG_LIBOPENCV 0 +%define CONFIG_LIBOPENJPEG 0 +%define CONFIG_LIBOPUS 0 +%define CONFIG_LIBPULSE 0 +%define CONFIG_LIBRTMP 0 +%define CONFIG_LIBSCHROEDINGER 0 +%define CONFIG_LIBSPEEX 0 +%define CONFIG_LIBTHEORA 0 +%define CONFIG_LIBTWOLAME 0 +%define CONFIG_LIBVO_AACENC 0 +%define CONFIG_LIBVO_AMRWBENC 0 
+%define CONFIG_LIBVORBIS 0 +%define CONFIG_LIBVPX 0 +%define CONFIG_LIBWAVPACK 0 +%define CONFIG_LIBWEBP 0 +%define CONFIG_LIBX264 0 +%define CONFIG_LIBX265 0 +%define CONFIG_LIBXAVS 0 +%define CONFIG_LIBXVID 0 +%define CONFIG_OPENSSL 0 +%define CONFIG_X11GRAB 0 +%define CONFIG_ZLIB 1 +%define CONFIG_GRAY 0 +%define CONFIG_HARDCODED_TABLES 0 +%define CONFIG_RUNTIME_CPUDETECT 0 +%define CONFIG_SAFE_BITSTREAM_READER 1 +%define CONFIG_SHARED 0 +%define CONFIG_SMALL 0 +%define CONFIG_SRAM 0 +%define CONFIG_STATIC 1 +%define CONFIG_SWSCALE_ALPHA 1 +%define CONFIG_DXVA2 0 +%define CONFIG_VAAPI 0 +%define CONFIG_VDA 0 +%define CONFIG_VDPAU 0 +%define CONFIG_GPL 0 +%define CONFIG_NONFREE 0 +%define CONFIG_VERSION3 0 +%define CONFIG_AVCODEC 1 +%define CONFIG_AVDEVICE 1 +%define CONFIG_AVFILTER 1 +%define CONFIG_AVFORMAT 1 +%define CONFIG_AVRESAMPLE 1 +%define CONFIG_AVUTIL 1 +%define CONFIG_SWSCALE 1 +%define CONFIG_AVCONV 1 +%define CONFIG_AVPLAY 0 +%define CONFIG_AVPROBE 1 +%define CONFIG_DCT 1 +%define CONFIG_DOC 1 +%define CONFIG_ERROR_RESILIENCE 1 +%define CONFIG_FFT 1 +%define CONFIG_LSP 1 +%define CONFIG_LZO 1 +%define CONFIG_MDCT 1 +%define CONFIG_NETWORK 1 +%define CONFIG_RDFT 1 +%define CONFIG_MEMALIGN_HACK 0 +%define CONFIG_NEON_CLOBBER_TEST 0 +%define CONFIG_PIC 0 +%define CONFIG_POD2MAN 1 +%define CONFIG_TEXI2HTML 0 +%define CONFIG_THUMB 0 +%define CONFIG_XMM_CLOBBER_TEST 0 +%define CONFIG_AANDCTTABLES 1 +%define CONFIG_AC3DSP 1 +%define CONFIG_AUDIO_FRAME_QUEUE 1 +%define CONFIG_AUDIODSP 1 +%define CONFIG_BLOCKDSP 1 +%define CONFIG_BSWAPDSP 1 +%define CONFIG_CABAC 1 +%define CONFIG_DVPROFILE 1 +%define CONFIG_FDCTDSP 1 +%define CONFIG_GCRYPT 0 +%define CONFIG_GOLOMB 1 +%define CONFIG_GPLV3 0 +%define CONFIG_H263DSP 1 +%define CONFIG_H264CHROMA 1 +%define CONFIG_H264DSP 1 +%define CONFIG_H264PRED 1 +%define CONFIG_H264QPEL 1 +%define CONFIG_HPELDSP 1 +%define CONFIG_HUFFMAN 1 +%define CONFIG_HUFFYUVDSP 1 +%define CONFIG_HUFFYUVENCDSP 1 +%define CONFIG_IDCTDSP 
1 +%define CONFIG_IIRFILTER 1 +%define CONFIG_INTRAX8 1 +%define CONFIG_LGPLV3 0 +%define CONFIG_LPC 1 +%define CONFIG_ME_CMP 1 +%define CONFIG_MPEG_ER 1 +%define CONFIG_MPEGAUDIO 1 +%define CONFIG_MPEGAUDIODSP 1 +%define CONFIG_MPEGVIDEO 1 +%define CONFIG_MPEGVIDEOENC 1 +%define CONFIG_NETTLE 0 +%define CONFIG_PIXBLOCKDSP 1 +%define CONFIG_QPELDSP 1 +%define CONFIG_RANGECODER 1 +%define CONFIG_RIFFDEC 1 +%define CONFIG_RIFFENC 1 +%define CONFIG_RTPDEC 1 +%define CONFIG_RTPENC_CHAIN 1 +%define CONFIG_SINEWIN 1 +%define CONFIG_TPELDSP 1 +%define CONFIG_VIDEODSP 1 +%define CONFIG_VP3DSP 1 +%define CONFIG_AAC_ADTSTOASC_BSF 1 +%define CONFIG_CHOMP_BSF 1 +%define CONFIG_DUMP_EXTRADATA_BSF 1 +%define CONFIG_H264_MP4TOANNEXB_BSF 1 +%define CONFIG_IMX_DUMP_HEADER_BSF 1 +%define CONFIG_MJPEG2JPEG_BSF 1 +%define CONFIG_MJPEGA_DUMP_HEADER_BSF 1 +%define CONFIG_MOV2TEXTSUB_BSF 1 +%define CONFIG_NOISE_BSF 1 +%define CONFIG_REMOVE_EXTRADATA_BSF 1 +%define CONFIG_TEXT2MOVSUB_BSF 1 +%define CONFIG_AASC_DECODER 1 +%define CONFIG_AIC_DECODER 1 +%define CONFIG_ALIAS_PIX_DECODER 1 +%define CONFIG_AMV_DECODER 1 +%define CONFIG_ANM_DECODER 1 +%define CONFIG_ANSI_DECODER 1 +%define CONFIG_ASV1_DECODER 1 +%define CONFIG_ASV2_DECODER 1 +%define CONFIG_AURA_DECODER 1 +%define CONFIG_AURA2_DECODER 1 +%define CONFIG_AVS_DECODER 1 +%define CONFIG_BETHSOFTVID_DECODER 1 +%define CONFIG_BFI_DECODER 1 +%define CONFIG_BINK_DECODER 1 +%define CONFIG_BMP_DECODER 1 +%define CONFIG_BMV_VIDEO_DECODER 1 +%define CONFIG_BRENDER_PIX_DECODER 1 +%define CONFIG_C93_DECODER 1 +%define CONFIG_CAVS_DECODER 1 +%define CONFIG_CDGRAPHICS_DECODER 1 +%define CONFIG_CDXL_DECODER 1 +%define CONFIG_CINEPAK_DECODER 1 +%define CONFIG_CLJR_DECODER 1 +%define CONFIG_CLLC_DECODER 1 +%define CONFIG_COMFORTNOISE_DECODER 1 +%define CONFIG_CSCD_DECODER 1 +%define CONFIG_CYUV_DECODER 1 +%define CONFIG_DFA_DECODER 1 +%define CONFIG_DNXHD_DECODER 1 +%define CONFIG_DPX_DECODER 1 +%define CONFIG_DSICINVIDEO_DECODER 1 +%define 
CONFIG_DVVIDEO_DECODER 1 +%define CONFIG_DXA_DECODER 1 +%define CONFIG_DXTORY_DECODER 1 +%define CONFIG_EACMV_DECODER 1 +%define CONFIG_EAMAD_DECODER 1 +%define CONFIG_EATGQ_DECODER 1 +%define CONFIG_EATGV_DECODER 1 +%define CONFIG_EATQI_DECODER 1 +%define CONFIG_EIGHTBPS_DECODER 1 +%define CONFIG_EIGHTSVX_EXP_DECODER 1 +%define CONFIG_EIGHTSVX_FIB_DECODER 1 +%define CONFIG_ESCAPE124_DECODER 1 +%define CONFIG_ESCAPE130_DECODER 1 +%define CONFIG_EXR_DECODER 1 +%define CONFIG_FFV1_DECODER 1 +%define CONFIG_FFVHUFF_DECODER 1 +%define CONFIG_FIC_DECODER 1 +%define CONFIG_FLASHSV_DECODER 1 +%define CONFIG_FLASHSV2_DECODER 1 +%define CONFIG_FLIC_DECODER 1 +%define CONFIG_FLV_DECODER 1 +%define CONFIG_FOURXM_DECODER 1 +%define CONFIG_FRAPS_DECODER 1 +%define CONFIG_FRWU_DECODER 1 +%define CONFIG_G2M_DECODER 1 +%define CONFIG_GIF_DECODER 1 +%define CONFIG_H261_DECODER 1 +%define CONFIG_H263_DECODER 1 +%define CONFIG_H263I_DECODER 1 +%define CONFIG_H264_DECODER 1 +%define CONFIG_HEVC_DECODER 1 +%define CONFIG_HNM4_VIDEO_DECODER 1 +%define CONFIG_HUFFYUV_DECODER 1 +%define CONFIG_IDCIN_DECODER 1 +%define CONFIG_IFF_BYTERUN1_DECODER 1 +%define CONFIG_IFF_ILBM_DECODER 1 +%define CONFIG_INDEO2_DECODER 1 +%define CONFIG_INDEO3_DECODER 1 +%define CONFIG_INDEO4_DECODER 1 +%define CONFIG_INDEO5_DECODER 1 +%define CONFIG_INTERPLAY_VIDEO_DECODER 1 +%define CONFIG_JPEG2000_DECODER 1 +%define CONFIG_JPEGLS_DECODER 1 +%define CONFIG_JV_DECODER 1 +%define CONFIG_KGV1_DECODER 1 +%define CONFIG_KMVC_DECODER 1 +%define CONFIG_LAGARITH_DECODER 1 +%define CONFIG_LOCO_DECODER 1 +%define CONFIG_MDEC_DECODER 1 +%define CONFIG_MIMIC_DECODER 1 +%define CONFIG_MJPEG_DECODER 1 +%define CONFIG_MJPEGB_DECODER 1 +%define CONFIG_MMVIDEO_DECODER 1 +%define CONFIG_MOTIONPIXELS_DECODER 1 +%define CONFIG_MPEG_XVMC_DECODER 0 +%define CONFIG_MPEG1VIDEO_DECODER 1 +%define CONFIG_MPEG2VIDEO_DECODER 1 +%define CONFIG_MPEG4_DECODER 1 +%define CONFIG_MSA1_DECODER 1 +%define CONFIG_MSMPEG4V1_DECODER 1 +%define 
CONFIG_MSMPEG4V2_DECODER 1 +%define CONFIG_MSMPEG4V3_DECODER 1 +%define CONFIG_MSRLE_DECODER 1 +%define CONFIG_MSS1_DECODER 1 +%define CONFIG_MSS2_DECODER 1 +%define CONFIG_MSVIDEO1_DECODER 1 +%define CONFIG_MSZH_DECODER 1 +%define CONFIG_MTS2_DECODER 1 +%define CONFIG_MVC1_DECODER 1 +%define CONFIG_MVC2_DECODER 1 +%define CONFIG_MXPEG_DECODER 1 +%define CONFIG_NUV_DECODER 1 +%define CONFIG_PAF_VIDEO_DECODER 1 +%define CONFIG_PAM_DECODER 1 +%define CONFIG_PBM_DECODER 1 +%define CONFIG_PCX_DECODER 1 +%define CONFIG_PGM_DECODER 1 +%define CONFIG_PGMYUV_DECODER 1 +%define CONFIG_PICTOR_DECODER 1 +%define CONFIG_PNG_DECODER 1 +%define CONFIG_PPM_DECODER 1 +%define CONFIG_PRORES_DECODER 1 +%define CONFIG_PTX_DECODER 1 +%define CONFIG_QDRAW_DECODER 1 +%define CONFIG_QPEG_DECODER 1 +%define CONFIG_QTRLE_DECODER 1 +%define CONFIG_R10K_DECODER 1 +%define CONFIG_R210_DECODER 1 +%define CONFIG_RAWVIDEO_DECODER 1 +%define CONFIG_RL2_DECODER 1 +%define CONFIG_ROQ_DECODER 1 +%define CONFIG_RPZA_DECODER 1 +%define CONFIG_RV10_DECODER 1 +%define CONFIG_RV20_DECODER 1 +%define CONFIG_RV30_DECODER 1 +%define CONFIG_RV40_DECODER 1 +%define CONFIG_S302M_DECODER 1 +%define CONFIG_SANM_DECODER 1 +%define CONFIG_SGI_DECODER 1 +%define CONFIG_SGIRLE_DECODER 1 +%define CONFIG_SMACKER_DECODER 1 +%define CONFIG_SMC_DECODER 1 +%define CONFIG_SP5X_DECODER 1 +%define CONFIG_SUNRAST_DECODER 1 +%define CONFIG_SVQ1_DECODER 1 +%define CONFIG_SVQ3_DECODER 1 +%define CONFIG_TARGA_DECODER 1 +%define CONFIG_THEORA_DECODER 1 +%define CONFIG_THP_DECODER 1 +%define CONFIG_TIERTEXSEQVIDEO_DECODER 1 +%define CONFIG_TIFF_DECODER 1 +%define CONFIG_TMV_DECODER 1 +%define CONFIG_TRUEMOTION1_DECODER 1 +%define CONFIG_TRUEMOTION2_DECODER 1 +%define CONFIG_TSCC_DECODER 1 +%define CONFIG_TSCC2_DECODER 1 +%define CONFIG_TXD_DECODER 1 +%define CONFIG_ULTI_DECODER 1 +%define CONFIG_UTVIDEO_DECODER 1 +%define CONFIG_V210_DECODER 1 +%define CONFIG_V210X_DECODER 1 +%define CONFIG_V410_DECODER 1 +%define CONFIG_VB_DECODER 
1 +%define CONFIG_VBLE_DECODER 1 +%define CONFIG_VC1_DECODER 1 +%define CONFIG_VC1IMAGE_DECODER 1 +%define CONFIG_VCR1_DECODER 1 +%define CONFIG_VMDVIDEO_DECODER 1 +%define CONFIG_VMNC_DECODER 1 +%define CONFIG_VP3_DECODER 1 +%define CONFIG_VP5_DECODER 1 +%define CONFIG_VP6_DECODER 1 +%define CONFIG_VP6A_DECODER 1 +%define CONFIG_VP6F_DECODER 1 +%define CONFIG_VP7_DECODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VQA_DECODER 1 +%define CONFIG_WEBP_DECODER 1 +%define CONFIG_WMV1_DECODER 1 +%define CONFIG_WMV2_DECODER 1 +%define CONFIG_WMV3_DECODER 1 +%define CONFIG_WMV3IMAGE_DECODER 1 +%define CONFIG_WNV1_DECODER 1 +%define CONFIG_XAN_WC3_DECODER 1 +%define CONFIG_XAN_WC4_DECODER 1 +%define CONFIG_XBM_DECODER 1 +%define CONFIG_XL_DECODER 1 +%define CONFIG_XWD_DECODER 1 +%define CONFIG_YOP_DECODER 1 +%define CONFIG_ZEROCODEC_DECODER 1 +%define CONFIG_ZLIB_DECODER 1 +%define CONFIG_ZMBV_DECODER 1 +%define CONFIG_AAC_DECODER 1 +%define CONFIG_AAC_LATM_DECODER 1 +%define CONFIG_AC3_DECODER 1 +%define CONFIG_ALAC_DECODER 1 +%define CONFIG_ALS_DECODER 1 +%define CONFIG_AMRNB_DECODER 1 +%define CONFIG_AMRWB_DECODER 1 +%define CONFIG_APE_DECODER 1 +%define CONFIG_ATRAC1_DECODER 1 +%define CONFIG_ATRAC3_DECODER 1 +%define CONFIG_ATRAC3P_DECODER 1 +%define CONFIG_BINKAUDIO_DCT_DECODER 1 +%define CONFIG_BINKAUDIO_RDFT_DECODER 1 +%define CONFIG_BMV_AUDIO_DECODER 1 +%define CONFIG_COOK_DECODER 1 +%define CONFIG_DCA_DECODER 1 +%define CONFIG_DSICINAUDIO_DECODER 1 +%define CONFIG_EAC3_DECODER 1 +%define CONFIG_FLAC_DECODER 1 +%define CONFIG_G723_1_DECODER 1 +%define CONFIG_GSM_DECODER 1 +%define CONFIG_GSM_MS_DECODER 1 +%define CONFIG_IAC_DECODER 1 +%define CONFIG_IMC_DECODER 1 +%define CONFIG_MACE3_DECODER 1 +%define CONFIG_MACE6_DECODER 1 +%define CONFIG_METASOUND_DECODER 1 +%define CONFIG_MLP_DECODER 1 +%define CONFIG_MP1_DECODER 1 +%define CONFIG_MP1FLOAT_DECODER 1 +%define CONFIG_MP2_DECODER 1 +%define CONFIG_MP2FLOAT_DECODER 1 +%define 
CONFIG_MP3_DECODER 1 +%define CONFIG_MP3FLOAT_DECODER 1 +%define CONFIG_MP3ADU_DECODER 1 +%define CONFIG_MP3ADUFLOAT_DECODER 1 +%define CONFIG_MP3ON4_DECODER 1 +%define CONFIG_MP3ON4FLOAT_DECODER 1 +%define CONFIG_MPC7_DECODER 1 +%define CONFIG_MPC8_DECODER 1 +%define CONFIG_NELLYMOSER_DECODER 1 +%define CONFIG_ON2AVC_DECODER 1 +%define CONFIG_OPUS_DECODER 1 +%define CONFIG_PAF_AUDIO_DECODER 1 +%define CONFIG_QCELP_DECODER 1 +%define CONFIG_QDM2_DECODER 1 +%define CONFIG_RA_144_DECODER 1 +%define CONFIG_RA_288_DECODER 1 +%define CONFIG_RALF_DECODER 1 +%define CONFIG_SHORTEN_DECODER 1 +%define CONFIG_SIPR_DECODER 1 +%define CONFIG_SMACKAUD_DECODER 1 +%define CONFIG_TAK_DECODER 1 +%define CONFIG_TRUEHD_DECODER 1 +%define CONFIG_TRUESPEECH_DECODER 1 +%define CONFIG_TTA_DECODER 1 +%define CONFIG_TWINVQ_DECODER 1 +%define CONFIG_VMDAUDIO_DECODER 1 +%define CONFIG_VORBIS_DECODER 1 +%define CONFIG_WAVPACK_DECODER 1 +%define CONFIG_WMALOSSLESS_DECODER 1 +%define CONFIG_WMAPRO_DECODER 1 +%define CONFIG_WMAV1_DECODER 1 +%define CONFIG_WMAV2_DECODER 1 +%define CONFIG_WMAVOICE_DECODER 1 +%define CONFIG_WS_SND1_DECODER 1 +%define CONFIG_PCM_ALAW_DECODER 1 +%define CONFIG_PCM_BLURAY_DECODER 1 +%define CONFIG_PCM_DVD_DECODER 1 +%define CONFIG_PCM_F32BE_DECODER 1 +%define CONFIG_PCM_F32LE_DECODER 1 +%define CONFIG_PCM_F64BE_DECODER 1 +%define CONFIG_PCM_F64LE_DECODER 1 +%define CONFIG_PCM_LXF_DECODER 1 +%define CONFIG_PCM_MULAW_DECODER 1 +%define CONFIG_PCM_S8_DECODER 1 +%define CONFIG_PCM_S8_PLANAR_DECODER 1 +%define CONFIG_PCM_S16BE_DECODER 1 +%define CONFIG_PCM_S16LE_DECODER 1 +%define CONFIG_PCM_S16LE_PLANAR_DECODER 1 +%define CONFIG_PCM_S24BE_DECODER 1 +%define CONFIG_PCM_S24DAUD_DECODER 1 +%define CONFIG_PCM_S24LE_DECODER 1 +%define CONFIG_PCM_S24LE_PLANAR_DECODER 1 +%define CONFIG_PCM_S32BE_DECODER 1 +%define CONFIG_PCM_S32LE_DECODER 1 +%define CONFIG_PCM_S32LE_PLANAR_DECODER 1 +%define CONFIG_PCM_U8_DECODER 1 +%define CONFIG_PCM_U16BE_DECODER 1 +%define 
CONFIG_PCM_U16LE_DECODER 1 +%define CONFIG_PCM_U24BE_DECODER 1 +%define CONFIG_PCM_U24LE_DECODER 1 +%define CONFIG_PCM_U32BE_DECODER 1 +%define CONFIG_PCM_U32LE_DECODER 1 +%define CONFIG_PCM_ZORK_DECODER 1 +%define CONFIG_INTERPLAY_DPCM_DECODER 1 +%define CONFIG_ROQ_DPCM_DECODER 1 +%define CONFIG_SOL_DPCM_DECODER 1 +%define CONFIG_XAN_DPCM_DECODER 1 +%define CONFIG_ADPCM_4XM_DECODER 1 +%define CONFIG_ADPCM_ADX_DECODER 1 +%define CONFIG_ADPCM_CT_DECODER 1 +%define CONFIG_ADPCM_EA_DECODER 1 +%define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1 +%define CONFIG_ADPCM_EA_R1_DECODER 1 +%define CONFIG_ADPCM_EA_R2_DECODER 1 +%define CONFIG_ADPCM_EA_R3_DECODER 1 +%define CONFIG_ADPCM_EA_XAS_DECODER 1 +%define CONFIG_ADPCM_G722_DECODER 1 +%define CONFIG_ADPCM_G726_DECODER 1 +%define CONFIG_ADPCM_IMA_AMV_DECODER 1 +%define CONFIG_ADPCM_IMA_APC_DECODER 1 +%define CONFIG_ADPCM_IMA_DK3_DECODER 1 +%define CONFIG_ADPCM_IMA_DK4_DECODER 1 +%define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1 +%define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1 +%define CONFIG_ADPCM_IMA_ISS_DECODER 1 +%define CONFIG_ADPCM_IMA_QT_DECODER 1 +%define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1 +%define CONFIG_ADPCM_IMA_WAV_DECODER 1 +%define CONFIG_ADPCM_IMA_WS_DECODER 1 +%define CONFIG_ADPCM_MS_DECODER 1 +%define CONFIG_ADPCM_SBPRO_2_DECODER 1 +%define CONFIG_ADPCM_SBPRO_3_DECODER 1 +%define CONFIG_ADPCM_SBPRO_4_DECODER 1 +%define CONFIG_ADPCM_SWF_DECODER 1 +%define CONFIG_ADPCM_THP_DECODER 1 +%define CONFIG_ADPCM_VIMA_DECODER 1 +%define CONFIG_ADPCM_XA_DECODER 1 +%define CONFIG_ADPCM_YAMAHA_DECODER 1 +%define CONFIG_ASS_DECODER 1 +%define CONFIG_DVBSUB_DECODER 1 +%define CONFIG_DVDSUB_DECODER 1 +%define CONFIG_PGSSUB_DECODER 1 +%define CONFIG_SRT_DECODER 1 +%define CONFIG_XSUB_DECODER 1 +%define CONFIG_LIBFDK_AAC_DECODER 0 +%define CONFIG_LIBGSM_DECODER 0 +%define CONFIG_LIBGSM_MS_DECODER 0 +%define CONFIG_LIBILBC_DECODER 0 +%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0 +%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0 +%define 
CONFIG_LIBOPENJPEG_DECODER 0 +%define CONFIG_LIBOPUS_DECODER 0 +%define CONFIG_LIBSCHROEDINGER_DECODER 0 +%define CONFIG_LIBSPEEX_DECODER 0 +%define CONFIG_LIBVPX_VP8_DECODER 0 +%define CONFIG_LIBVPX_VP9_DECODER 0 +%define CONFIG_AAC_DEMUXER 1 +%define CONFIG_AC3_DEMUXER 1 +%define CONFIG_ADX_DEMUXER 1 +%define CONFIG_AEA_DEMUXER 1 +%define CONFIG_AIFF_DEMUXER 1 +%define CONFIG_AMR_DEMUXER 1 +%define CONFIG_ANM_DEMUXER 1 +%define CONFIG_APC_DEMUXER 1 +%define CONFIG_APE_DEMUXER 1 +%define CONFIG_ASF_DEMUXER 1 +%define CONFIG_ASS_DEMUXER 1 +%define CONFIG_AU_DEMUXER 1 +%define CONFIG_AVI_DEMUXER 1 +%define CONFIG_AVISYNTH_DEMUXER 0 +%define CONFIG_AVS_DEMUXER 1 +%define CONFIG_BETHSOFTVID_DEMUXER 1 +%define CONFIG_BFI_DEMUXER 1 +%define CONFIG_BINK_DEMUXER 1 +%define CONFIG_BMV_DEMUXER 1 +%define CONFIG_C93_DEMUXER 1 +%define CONFIG_CAF_DEMUXER 1 +%define CONFIG_CAVSVIDEO_DEMUXER 1 +%define CONFIG_CDG_DEMUXER 1 +%define CONFIG_CDXL_DEMUXER 1 +%define CONFIG_DAUD_DEMUXER 1 +%define CONFIG_DFA_DEMUXER 1 +%define CONFIG_DIRAC_DEMUXER 1 +%define CONFIG_DNXHD_DEMUXER 1 +%define CONFIG_DSICIN_DEMUXER 1 +%define CONFIG_DTS_DEMUXER 1 +%define CONFIG_DV_DEMUXER 1 +%define CONFIG_DXA_DEMUXER 1 +%define CONFIG_EA_DEMUXER 1 +%define CONFIG_EA_CDATA_DEMUXER 1 +%define CONFIG_EAC3_DEMUXER 1 +%define CONFIG_FFMETADATA_DEMUXER 1 +%define CONFIG_FILMSTRIP_DEMUXER 1 +%define CONFIG_FLAC_DEMUXER 1 +%define CONFIG_FLIC_DEMUXER 1 +%define CONFIG_FLV_DEMUXER 1 +%define CONFIG_FOURXM_DEMUXER 1 +%define CONFIG_G722_DEMUXER 1 +%define CONFIG_G723_1_DEMUXER 1 +%define CONFIG_GSM_DEMUXER 1 +%define CONFIG_GXF_DEMUXER 1 +%define CONFIG_H261_DEMUXER 1 +%define CONFIG_H263_DEMUXER 1 +%define CONFIG_H264_DEMUXER 1 +%define CONFIG_HEVC_DEMUXER 1 +%define CONFIG_HLS_DEMUXER 1 +%define CONFIG_HNM_DEMUXER 1 +%define CONFIG_IDCIN_DEMUXER 1 +%define CONFIG_IFF_DEMUXER 1 +%define CONFIG_ILBC_DEMUXER 1 +%define CONFIG_IMAGE2_DEMUXER 1 +%define CONFIG_IMAGE2PIPE_DEMUXER 1 +%define CONFIG_INGENIENT_DEMUXER 
1 +%define CONFIG_IPMOVIE_DEMUXER 1 +%define CONFIG_ISS_DEMUXER 1 +%define CONFIG_IV8_DEMUXER 1 +%define CONFIG_IVF_DEMUXER 1 +%define CONFIG_JV_DEMUXER 1 +%define CONFIG_LATM_DEMUXER 1 +%define CONFIG_LMLM4_DEMUXER 1 +%define CONFIG_LXF_DEMUXER 1 +%define CONFIG_M4V_DEMUXER 1 +%define CONFIG_MATROSKA_DEMUXER 1 +%define CONFIG_MJPEG_DEMUXER 1 +%define CONFIG_MLP_DEMUXER 1 +%define CONFIG_MM_DEMUXER 1 +%define CONFIG_MMF_DEMUXER 1 +%define CONFIG_MOV_DEMUXER 1 +%define CONFIG_MP3_DEMUXER 1 +%define CONFIG_MPC_DEMUXER 1 +%define CONFIG_MPC8_DEMUXER 1 +%define CONFIG_MPEGPS_DEMUXER 1 +%define CONFIG_MPEGTS_DEMUXER 1 +%define CONFIG_MPEGTSRAW_DEMUXER 1 +%define CONFIG_MPEGVIDEO_DEMUXER 1 +%define CONFIG_MSNWC_TCP_DEMUXER 1 +%define CONFIG_MTV_DEMUXER 1 +%define CONFIG_MV_DEMUXER 1 +%define CONFIG_MVI_DEMUXER 1 +%define CONFIG_MXF_DEMUXER 1 +%define CONFIG_MXG_DEMUXER 1 +%define CONFIG_NC_DEMUXER 1 +%define CONFIG_NSV_DEMUXER 1 +%define CONFIG_NUT_DEMUXER 1 +%define CONFIG_NUV_DEMUXER 1 +%define CONFIG_OGG_DEMUXER 1 +%define CONFIG_OMA_DEMUXER 1 +%define CONFIG_PAF_DEMUXER 1 +%define CONFIG_PCM_ALAW_DEMUXER 1 +%define CONFIG_PCM_MULAW_DEMUXER 1 +%define CONFIG_PCM_F64BE_DEMUXER 1 +%define CONFIG_PCM_F64LE_DEMUXER 1 +%define CONFIG_PCM_F32BE_DEMUXER 1 +%define CONFIG_PCM_F32LE_DEMUXER 1 +%define CONFIG_PCM_S32BE_DEMUXER 1 +%define CONFIG_PCM_S32LE_DEMUXER 1 +%define CONFIG_PCM_S24BE_DEMUXER 1 +%define CONFIG_PCM_S24LE_DEMUXER 1 +%define CONFIG_PCM_S16BE_DEMUXER 1 +%define CONFIG_PCM_S16LE_DEMUXER 1 +%define CONFIG_PCM_S8_DEMUXER 1 +%define CONFIG_PCM_U32BE_DEMUXER 1 +%define CONFIG_PCM_U32LE_DEMUXER 1 +%define CONFIG_PCM_U24BE_DEMUXER 1 +%define CONFIG_PCM_U24LE_DEMUXER 1 +%define CONFIG_PCM_U16BE_DEMUXER 1 +%define CONFIG_PCM_U16LE_DEMUXER 1 +%define CONFIG_PCM_U8_DEMUXER 1 +%define CONFIG_PMP_DEMUXER 1 +%define CONFIG_PVA_DEMUXER 1 +%define CONFIG_QCP_DEMUXER 1 +%define CONFIG_R3D_DEMUXER 1 +%define CONFIG_RAWVIDEO_DEMUXER 1 +%define CONFIG_RL2_DEMUXER 1 +%define 
CONFIG_RM_DEMUXER 1 +%define CONFIG_ROQ_DEMUXER 1 +%define CONFIG_RPL_DEMUXER 1 +%define CONFIG_RSO_DEMUXER 1 +%define CONFIG_RTP_DEMUXER 1 +%define CONFIG_RTSP_DEMUXER 1 +%define CONFIG_SAP_DEMUXER 1 +%define CONFIG_SDP_DEMUXER 1 +%define CONFIG_SEGAFILM_DEMUXER 1 +%define CONFIG_SHORTEN_DEMUXER 1 +%define CONFIG_SIFF_DEMUXER 1 +%define CONFIG_SMACKER_DEMUXER 1 +%define CONFIG_SMJPEG_DEMUXER 1 +%define CONFIG_SMUSH_DEMUXER 1 +%define CONFIG_SOL_DEMUXER 1 +%define CONFIG_SOX_DEMUXER 1 +%define CONFIG_SPDIF_DEMUXER 1 +%define CONFIG_SRT_DEMUXER 1 +%define CONFIG_STR_DEMUXER 1 +%define CONFIG_SWF_DEMUXER 1 +%define CONFIG_TAK_DEMUXER 1 +%define CONFIG_THP_DEMUXER 1 +%define CONFIG_TIERTEXSEQ_DEMUXER 1 +%define CONFIG_TMV_DEMUXER 1 +%define CONFIG_TRUEHD_DEMUXER 1 +%define CONFIG_TTA_DEMUXER 1 +%define CONFIG_TXD_DEMUXER 1 +%define CONFIG_TTY_DEMUXER 1 +%define CONFIG_VC1_DEMUXER 1 +%define CONFIG_VC1T_DEMUXER 1 +%define CONFIG_VMD_DEMUXER 1 +%define CONFIG_VOC_DEMUXER 1 +%define CONFIG_VQF_DEMUXER 1 +%define CONFIG_W64_DEMUXER 1 +%define CONFIG_WAV_DEMUXER 1 +%define CONFIG_WC3_DEMUXER 1 +%define CONFIG_WSAUD_DEMUXER 1 +%define CONFIG_WSVQA_DEMUXER 1 +%define CONFIG_WTV_DEMUXER 1 +%define CONFIG_WV_DEMUXER 1 +%define CONFIG_XA_DEMUXER 1 +%define CONFIG_XMV_DEMUXER 1 +%define CONFIG_XWMA_DEMUXER 1 +%define CONFIG_YOP_DEMUXER 1 +%define CONFIG_YUV4MPEGPIPE_DEMUXER 1 +%define CONFIG_A64MULTI_ENCODER 1 +%define CONFIG_A64MULTI5_ENCODER 1 +%define CONFIG_ALIAS_PIX_ENCODER 1 +%define CONFIG_ASV1_ENCODER 1 +%define CONFIG_ASV2_ENCODER 1 +%define CONFIG_BMP_ENCODER 1 +%define CONFIG_CLJR_ENCODER 1 +%define CONFIG_COMFORTNOISE_ENCODER 1 +%define CONFIG_DNXHD_ENCODER 1 +%define CONFIG_DPX_ENCODER 1 +%define CONFIG_DVVIDEO_ENCODER 1 +%define CONFIG_FFV1_ENCODER 1 +%define CONFIG_FFVHUFF_ENCODER 1 +%define CONFIG_FLASHSV_ENCODER 1 +%define CONFIG_FLV_ENCODER 1 +%define CONFIG_GIF_ENCODER 1 +%define CONFIG_H261_ENCODER 1 +%define CONFIG_H263_ENCODER 1 +%define 
CONFIG_H263P_ENCODER 1 +%define CONFIG_HUFFYUV_ENCODER 1 +%define CONFIG_JPEGLS_ENCODER 1 +%define CONFIG_LJPEG_ENCODER 1 +%define CONFIG_MJPEG_ENCODER 1 +%define CONFIG_MPEG1VIDEO_ENCODER 1 +%define CONFIG_MPEG2VIDEO_ENCODER 1 +%define CONFIG_MPEG4_ENCODER 1 +%define CONFIG_MSMPEG4V2_ENCODER 1 +%define CONFIG_MSMPEG4V3_ENCODER 1 +%define CONFIG_PAM_ENCODER 1 +%define CONFIG_PBM_ENCODER 1 +%define CONFIG_PCX_ENCODER 1 +%define CONFIG_PGM_ENCODER 1 +%define CONFIG_PGMYUV_ENCODER 1 +%define CONFIG_PNG_ENCODER 1 +%define CONFIG_PPM_ENCODER 1 +%define CONFIG_PRORES_ENCODER 1 +%define CONFIG_QTRLE_ENCODER 1 +%define CONFIG_RAWVIDEO_ENCODER 1 +%define CONFIG_ROQ_ENCODER 1 +%define CONFIG_RV10_ENCODER 1 +%define CONFIG_RV20_ENCODER 1 +%define CONFIG_SGI_ENCODER 1 +%define CONFIG_SUNRAST_ENCODER 1 +%define CONFIG_SVQ1_ENCODER 1 +%define CONFIG_TARGA_ENCODER 1 +%define CONFIG_LIBTWOLAME_ENCODER 0 +%define CONFIG_TIFF_ENCODER 1 +%define CONFIG_UTVIDEO_ENCODER 1 +%define CONFIG_V210_ENCODER 1 +%define CONFIG_V410_ENCODER 1 +%define CONFIG_WMV1_ENCODER 1 +%define CONFIG_WMV2_ENCODER 1 +%define CONFIG_XBM_ENCODER 1 +%define CONFIG_XWD_ENCODER 1 +%define CONFIG_ZLIB_ENCODER 1 +%define CONFIG_ZMBV_ENCODER 1 +%define CONFIG_AAC_ENCODER 1 +%define CONFIG_AC3_ENCODER 1 +%define CONFIG_AC3_FIXED_ENCODER 1 +%define CONFIG_ALAC_ENCODER 1 +%define CONFIG_EAC3_ENCODER 1 +%define CONFIG_FLAC_ENCODER 1 +%define CONFIG_MP2_ENCODER 1 +%define CONFIG_NELLYMOSER_ENCODER 1 +%define CONFIG_RA_144_ENCODER 1 +%define CONFIG_VORBIS_ENCODER 1 +%define CONFIG_WMAV1_ENCODER 1 +%define CONFIG_WMAV2_ENCODER 1 +%define CONFIG_PCM_ALAW_ENCODER 1 +%define CONFIG_PCM_F32BE_ENCODER 1 +%define CONFIG_PCM_F32LE_ENCODER 1 +%define CONFIG_PCM_F64BE_ENCODER 1 +%define CONFIG_PCM_F64LE_ENCODER 1 +%define CONFIG_PCM_MULAW_ENCODER 1 +%define CONFIG_PCM_S8_ENCODER 1 +%define CONFIG_PCM_S16BE_ENCODER 1 +%define CONFIG_PCM_S16LE_ENCODER 1 +%define CONFIG_PCM_S24BE_ENCODER 1 +%define CONFIG_PCM_S24DAUD_ENCODER 1 
+%define CONFIG_PCM_S24LE_ENCODER 1 +%define CONFIG_PCM_S32BE_ENCODER 1 +%define CONFIG_PCM_S32LE_ENCODER 1 +%define CONFIG_PCM_U8_ENCODER 1 +%define CONFIG_PCM_U16BE_ENCODER 1 +%define CONFIG_PCM_U16LE_ENCODER 1 +%define CONFIG_PCM_U24BE_ENCODER 1 +%define CONFIG_PCM_U24LE_ENCODER 1 +%define CONFIG_PCM_U32BE_ENCODER 1 +%define CONFIG_PCM_U32LE_ENCODER 1 +%define CONFIG_ROQ_DPCM_ENCODER 1 +%define CONFIG_ADPCM_ADX_ENCODER 1 +%define CONFIG_ADPCM_G722_ENCODER 1 +%define CONFIG_ADPCM_G726_ENCODER 1 +%define CONFIG_ADPCM_IMA_QT_ENCODER 1 +%define CONFIG_ADPCM_IMA_WAV_ENCODER 1 +%define CONFIG_ADPCM_MS_ENCODER 1 +%define CONFIG_ADPCM_SWF_ENCODER 1 +%define CONFIG_ADPCM_YAMAHA_ENCODER 1 +%define CONFIG_ASS_ENCODER 1 +%define CONFIG_DVBSUB_ENCODER 1 +%define CONFIG_DVDSUB_ENCODER 1 +%define CONFIG_XSUB_ENCODER 1 +%define CONFIG_LIBFAAC_ENCODER 0 +%define CONFIG_LIBFDK_AAC_ENCODER 0 +%define CONFIG_LIBGSM_ENCODER 0 +%define CONFIG_LIBGSM_MS_ENCODER 0 +%define CONFIG_LIBILBC_ENCODER 0 +%define CONFIG_LIBMP3LAME_ENCODER 0 +%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0 +%define CONFIG_LIBOPENJPEG_ENCODER 0 +%define CONFIG_LIBOPUS_ENCODER 0 +%define CONFIG_LIBSCHROEDINGER_ENCODER 0 +%define CONFIG_LIBSPEEX_ENCODER 0 +%define CONFIG_LIBTHEORA_ENCODER 0 +%define CONFIG_LIBVO_AACENC_ENCODER 0 +%define CONFIG_LIBVO_AMRWBENC_ENCODER 0 +%define CONFIG_LIBVORBIS_ENCODER 0 +%define CONFIG_LIBVPX_VP8_ENCODER 0 +%define CONFIG_LIBVPX_VP9_ENCODER 0 +%define CONFIG_LIBWAVPACK_ENCODER 0 +%define CONFIG_LIBWEBP_ENCODER 0 +%define CONFIG_LIBX264_ENCODER 0 +%define CONFIG_LIBX265_ENCODER 0 +%define CONFIG_LIBXAVS_ENCODER 0 +%define CONFIG_LIBXVID_ENCODER 0 +%define CONFIG_AFORMAT_FILTER 1 +%define CONFIG_AMIX_FILTER 1 +%define CONFIG_ANULL_FILTER 1 +%define CONFIG_ASETPTS_FILTER 1 +%define CONFIG_ASETTB_FILTER 1 +%define CONFIG_ASHOWINFO_FILTER 1 +%define CONFIG_ASPLIT_FILTER 1 +%define CONFIG_ASYNCTS_FILTER 1 +%define CONFIG_ATRIM_FILTER 1 +%define CONFIG_BS2B_FILTER 0 +%define 
CONFIG_CHANNELMAP_FILTER 1 +%define CONFIG_CHANNELSPLIT_FILTER 1 +%define CONFIG_COMPAND_FILTER 1 +%define CONFIG_JOIN_FILTER 1 +%define CONFIG_RESAMPLE_FILTER 1 +%define CONFIG_VOLUME_FILTER 1 +%define CONFIG_ANULLSRC_FILTER 1 +%define CONFIG_ANULLSINK_FILTER 1 +%define CONFIG_BLACKFRAME_FILTER 0 +%define CONFIG_BOXBLUR_FILTER 0 +%define CONFIG_COPY_FILTER 1 +%define CONFIG_CROP_FILTER 1 +%define CONFIG_CROPDETECT_FILTER 0 +%define CONFIG_DELOGO_FILTER 0 +%define CONFIG_DRAWBOX_FILTER 1 +%define CONFIG_DRAWTEXT_FILTER 0 +%define CONFIG_FADE_FILTER 1 +%define CONFIG_FIELDORDER_FILTER 1 +%define CONFIG_FORMAT_FILTER 1 +%define CONFIG_FPS_FILTER 1 +%define CONFIG_FRAMEPACK_FILTER 1 +%define CONFIG_FREI0R_FILTER 0 +%define CONFIG_GRADFUN_FILTER 1 +%define CONFIG_HFLIP_FILTER 1 +%define CONFIG_HQDN3D_FILTER 0 +%define CONFIG_INTERLACE_FILTER 0 +%define CONFIG_LUT_FILTER 1 +%define CONFIG_LUTRGB_FILTER 1 +%define CONFIG_LUTYUV_FILTER 1 +%define CONFIG_NEGATE_FILTER 1 +%define CONFIG_NOFORMAT_FILTER 1 +%define CONFIG_NULL_FILTER 1 +%define CONFIG_OCV_FILTER 0 +%define CONFIG_OVERLAY_FILTER 1 +%define CONFIG_PAD_FILTER 1 +%define CONFIG_PIXDESCTEST_FILTER 1 +%define CONFIG_SCALE_FILTER 1 +%define CONFIG_SELECT_FILTER 1 +%define CONFIG_SETDAR_FILTER 1 +%define CONFIG_SETPTS_FILTER 1 +%define CONFIG_SETSAR_FILTER 1 +%define CONFIG_SETTB_FILTER 1 +%define CONFIG_SHOWINFO_FILTER 1 +%define CONFIG_SHUFFLEPLANES_FILTER 1 +%define CONFIG_SPLIT_FILTER 1 +%define CONFIG_TRANSPOSE_FILTER 1 +%define CONFIG_TRIM_FILTER 1 +%define CONFIG_UNSHARP_FILTER 1 +%define CONFIG_VFLIP_FILTER 1 +%define CONFIG_YADIF_FILTER 1 +%define CONFIG_COLOR_FILTER 1 +%define CONFIG_FREI0R_SRC_FILTER 0 +%define CONFIG_MOVIE_FILTER 1 +%define CONFIG_NULLSRC_FILTER 1 +%define CONFIG_RGBTESTSRC_FILTER 1 +%define CONFIG_TESTSRC_FILTER 1 +%define CONFIG_NULLSINK_FILTER 1 +%define CONFIG_H263_VAAPI_HWACCEL 0 +%define CONFIG_H263_VDPAU_HWACCEL 0 +%define CONFIG_H264_DXVA2_HWACCEL 0 +%define 
CONFIG_H264_VAAPI_HWACCEL 0 +%define CONFIG_H264_VDA_HWACCEL 0 +%define CONFIG_H264_VDA_OLD_HWACCEL 0 +%define CONFIG_H264_VDPAU_HWACCEL 0 +%define CONFIG_MPEG1_VDPAU_HWACCEL 0 +%define CONFIG_MPEG2_DXVA2_HWACCEL 0 +%define CONFIG_MPEG2_VAAPI_HWACCEL 0 +%define CONFIG_MPEG2_VDPAU_HWACCEL 0 +%define CONFIG_MPEG4_VAAPI_HWACCEL 0 +%define CONFIG_MPEG4_VDPAU_HWACCEL 0 +%define CONFIG_VC1_DXVA2_HWACCEL 0 +%define CONFIG_VC1_VAAPI_HWACCEL 0 +%define CONFIG_VC1_VDPAU_HWACCEL 0 +%define CONFIG_WMV3_DXVA2_HWACCEL 0 +%define CONFIG_WMV3_VAAPI_HWACCEL 0 +%define CONFIG_WMV3_VDPAU_HWACCEL 0 +%define CONFIG_ALSA_INDEV 1 +%define CONFIG_BKTR_INDEV 0 +%define CONFIG_DV1394_INDEV 1 +%define CONFIG_FBDEV_INDEV 1 +%define CONFIG_JACK_INDEV 0 +%define CONFIG_OSS_INDEV 1 +%define CONFIG_PULSE_INDEV 0 +%define CONFIG_SNDIO_INDEV 0 +%define CONFIG_V4L2_INDEV 1 +%define CONFIG_VFWCAP_INDEV 0 +%define CONFIG_X11GRAB_INDEV 0 +%define CONFIG_LIBCDIO_INDEV 0 +%define CONFIG_LIBDC1394_INDEV 0 +%define CONFIG_A64_MUXER 1 +%define CONFIG_AC3_MUXER 1 +%define CONFIG_ADTS_MUXER 1 +%define CONFIG_ADX_MUXER 1 +%define CONFIG_AIFF_MUXER 1 +%define CONFIG_AMR_MUXER 1 +%define CONFIG_ASF_MUXER 1 +%define CONFIG_ASS_MUXER 1 +%define CONFIG_ASF_STREAM_MUXER 1 +%define CONFIG_AU_MUXER 1 +%define CONFIG_AVI_MUXER 1 +%define CONFIG_AVM2_MUXER 1 +%define CONFIG_CAVSVIDEO_MUXER 1 +%define CONFIG_CRC_MUXER 1 +%define CONFIG_DAUD_MUXER 1 +%define CONFIG_DIRAC_MUXER 1 +%define CONFIG_DNXHD_MUXER 1 +%define CONFIG_DTS_MUXER 1 +%define CONFIG_DV_MUXER 1 +%define CONFIG_EAC3_MUXER 1 +%define CONFIG_F4V_MUXER 1 +%define CONFIG_FFMETADATA_MUXER 1 +%define CONFIG_FILMSTRIP_MUXER 1 +%define CONFIG_FLAC_MUXER 1 +%define CONFIG_FLV_MUXER 1 +%define CONFIG_FRAMECRC_MUXER 1 +%define CONFIG_FRAMEMD5_MUXER 1 +%define CONFIG_G722_MUXER 1 +%define CONFIG_GIF_MUXER 1 +%define CONFIG_GXF_MUXER 1 +%define CONFIG_H261_MUXER 1 +%define CONFIG_H263_MUXER 1 +%define CONFIG_H264_MUXER 1 +%define CONFIG_HDS_MUXER 1 +%define 
CONFIG_HEVC_MUXER 1 +%define CONFIG_HLS_MUXER 1 +%define CONFIG_ILBC_MUXER 1 +%define CONFIG_IMAGE2_MUXER 1 +%define CONFIG_IMAGE2PIPE_MUXER 1 +%define CONFIG_IPOD_MUXER 1 +%define CONFIG_ISMV_MUXER 1 +%define CONFIG_IVF_MUXER 1 +%define CONFIG_LATM_MUXER 1 +%define CONFIG_M4V_MUXER 1 +%define CONFIG_MD5_MUXER 1 +%define CONFIG_MATROSKA_MUXER 1 +%define CONFIG_MATROSKA_AUDIO_MUXER 1 +%define CONFIG_MJPEG_MUXER 1 +%define CONFIG_MLP_MUXER 1 +%define CONFIG_MMF_MUXER 1 +%define CONFIG_MOV_MUXER 1 +%define CONFIG_MP2_MUXER 1 +%define CONFIG_MP3_MUXER 1 +%define CONFIG_MP4_MUXER 1 +%define CONFIG_MPEG1SYSTEM_MUXER 1 +%define CONFIG_MPEG1VCD_MUXER 1 +%define CONFIG_MPEG1VIDEO_MUXER 1 +%define CONFIG_MPEG2DVD_MUXER 1 +%define CONFIG_MPEG2SVCD_MUXER 1 +%define CONFIG_MPEG2VIDEO_MUXER 1 +%define CONFIG_MPEG2VOB_MUXER 1 +%define CONFIG_MPEGTS_MUXER 1 +%define CONFIG_MPJPEG_MUXER 1 +%define CONFIG_MXF_MUXER 1 +%define CONFIG_MXF_D10_MUXER 1 +%define CONFIG_NULL_MUXER 1 +%define CONFIG_NUT_MUXER 1 +%define CONFIG_OGG_MUXER 1 +%define CONFIG_OMA_MUXER 1 +%define CONFIG_PCM_ALAW_MUXER 1 +%define CONFIG_PCM_MULAW_MUXER 1 +%define CONFIG_PCM_F64BE_MUXER 1 +%define CONFIG_PCM_F64LE_MUXER 1 +%define CONFIG_PCM_F32BE_MUXER 1 +%define CONFIG_PCM_F32LE_MUXER 1 +%define CONFIG_PCM_S32BE_MUXER 1 +%define CONFIG_PCM_S32LE_MUXER 1 +%define CONFIG_PCM_S24BE_MUXER 1 +%define CONFIG_PCM_S24LE_MUXER 1 +%define CONFIG_PCM_S16BE_MUXER 1 +%define CONFIG_PCM_S16LE_MUXER 1 +%define CONFIG_PCM_S8_MUXER 1 +%define CONFIG_PCM_U32BE_MUXER 1 +%define CONFIG_PCM_U32LE_MUXER 1 +%define CONFIG_PCM_U24BE_MUXER 1 +%define CONFIG_PCM_U24LE_MUXER 1 +%define CONFIG_PCM_U16BE_MUXER 1 +%define CONFIG_PCM_U16LE_MUXER 1 +%define CONFIG_PCM_U8_MUXER 1 +%define CONFIG_PSP_MUXER 1 +%define CONFIG_RAWVIDEO_MUXER 1 +%define CONFIG_RM_MUXER 1 +%define CONFIG_ROQ_MUXER 1 +%define CONFIG_RSO_MUXER 1 +%define CONFIG_RTP_MUXER 1 +%define CONFIG_RTSP_MUXER 1 +%define CONFIG_SAP_MUXER 1 +%define CONFIG_SEGMENT_MUXER 1 
+%define CONFIG_SMJPEG_MUXER 1 +%define CONFIG_SMOOTHSTREAMING_MUXER 1 +%define CONFIG_SOX_MUXER 1 +%define CONFIG_SPDIF_MUXER 1 +%define CONFIG_SRT_MUXER 1 +%define CONFIG_SWF_MUXER 1 +%define CONFIG_TG2_MUXER 1 +%define CONFIG_TGP_MUXER 1 +%define CONFIG_TRUEHD_MUXER 1 +%define CONFIG_VC1T_MUXER 1 +%define CONFIG_VOC_MUXER 1 +%define CONFIG_WAV_MUXER 1 +%define CONFIG_WEBM_MUXER 1 +%define CONFIG_WV_MUXER 1 +%define CONFIG_YUV4MPEGPIPE_MUXER 1 +%define CONFIG_ALSA_OUTDEV 1 +%define CONFIG_OSS_OUTDEV 1 +%define CONFIG_SNDIO_OUTDEV 0 +%define CONFIG_AAC_PARSER 1 +%define CONFIG_AAC_LATM_PARSER 1 +%define CONFIG_AC3_PARSER 1 +%define CONFIG_ADX_PARSER 1 +%define CONFIG_BMP_PARSER 1 +%define CONFIG_CAVSVIDEO_PARSER 1 +%define CONFIG_COOK_PARSER 1 +%define CONFIG_DCA_PARSER 1 +%define CONFIG_DIRAC_PARSER 1 +%define CONFIG_DNXHD_PARSER 1 +%define CONFIG_DVBSUB_PARSER 1 +%define CONFIG_DVDSUB_PARSER 1 +%define CONFIG_FLAC_PARSER 1 +%define CONFIG_GSM_PARSER 1 +%define CONFIG_H261_PARSER 1 +%define CONFIG_H263_PARSER 1 +%define CONFIG_H264_PARSER 1 +%define CONFIG_HEVC_PARSER 1 +%define CONFIG_MJPEG_PARSER 1 +%define CONFIG_MLP_PARSER 1 +%define CONFIG_MPEG4VIDEO_PARSER 1 +%define CONFIG_MPEGAUDIO_PARSER 1 +%define CONFIG_MPEGVIDEO_PARSER 1 +%define CONFIG_OPUS_PARSER 1 +%define CONFIG_PNG_PARSER 1 +%define CONFIG_PNM_PARSER 1 +%define CONFIG_RV30_PARSER 1 +%define CONFIG_RV40_PARSER 1 +%define CONFIG_TAK_PARSER 1 +%define CONFIG_VC1_PARSER 1 +%define CONFIG_VORBIS_PARSER 1 +%define CONFIG_VP3_PARSER 1 +%define CONFIG_VP8_PARSER 1 +%define CONFIG_CONCAT_PROTOCOL 1 +%define CONFIG_CRYPTO_PROTOCOL 1 +%define CONFIG_FFRTMPCRYPT_PROTOCOL 0 +%define CONFIG_FFRTMPHTTP_PROTOCOL 1 +%define CONFIG_FILE_PROTOCOL 1 +%define CONFIG_GOPHER_PROTOCOL 1 +%define CONFIG_HLS_PROTOCOL 1 +%define CONFIG_HTTP_PROTOCOL 1 +%define CONFIG_HTTPPROXY_PROTOCOL 1 +%define CONFIG_HTTPS_PROTOCOL 0 +%define CONFIG_MMSH_PROTOCOL 1 +%define CONFIG_MMST_PROTOCOL 1 +%define CONFIG_MD5_PROTOCOL 1 +%define 
CONFIG_PIPE_PROTOCOL 1 +%define CONFIG_RTMP_PROTOCOL 1 +%define CONFIG_RTMPE_PROTOCOL 0 +%define CONFIG_RTMPS_PROTOCOL 0 +%define CONFIG_RTMPT_PROTOCOL 1 +%define CONFIG_RTMPTE_PROTOCOL 0 +%define CONFIG_RTMPTS_PROTOCOL 0 +%define CONFIG_RTP_PROTOCOL 1 +%define CONFIG_SCTP_PROTOCOL 0 +%define CONFIG_SRTP_PROTOCOL 1 +%define CONFIG_TCP_PROTOCOL 1 +%define CONFIG_TLS_PROTOCOL 0 +%define CONFIG_UDP_PROTOCOL 1 +%define CONFIG_UNIX_PROTOCOL 1 +%define CONFIG_LIBRTMP_PROTOCOL 0 +%define CONFIG_LIBRTMPE_PROTOCOL 0 +%define CONFIG_LIBRTMPS_PROTOCOL 0 +%define CONFIG_LIBRTMPT_PROTOCOL 0 +%define CONFIG_LIBRTMPTE_PROTOCOL 0 diff --git a/src/expansion.asm b/src/expansion.asm new file mode 100644 index 0000000..4ac77bf --- /dev/null +++ b/src/expansion.asm @@ -0,0 +1,91 @@ +%include "x86util.asm" + +SECTION .text + +; len1 len2 +; compute vec2^T·mat·vec1 = ∑ ∑ mat[i, j] vec1[i] vec2[j] +; i=1 j=1 +%macro SCALARPRODUCT_METRIC 0 +cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos + shl len2q, 3 + shl len1q, 3 + + add vec1q, len1q + add vec2q, len2q + neg len2q + + lea r6, [3 * len1q] + + xorpd m0, m0 + +.loop_2 + mov rowposq, len1q + neg rowposq + + xorpd m1, m1 + xorpd m2, m2 + +%if mmsize == 32 + xorpd m3, m3 + xorpd m4, m4 +%endif + +.loop_1 + mova m5, [vec1q + rowposq] + +%if mmsize == 32 + FMULADD_PD m4, m5, [matq + r6q], m4, m6 + FMULADD_PD m3, m5, [matq + 2 * len1q], m3, m6 +%endif + + FMULADD_PD m2, m5, [matq + 1 * len1q], m2, m6 + FMULADD_PD m1, m5, [matq + 0 * len1q], m1, m6 + + add matq, mmsize + add rowposq, mmsize + js .loop_1 + + haddpd m1, m2 + +%if mmsize == 32 + vextractf128 xmm2, ymm1, 1 + addpd xmm1, xmm2 + + haddpd m3, m4 + vextractf128 xmm4, ymm3, 1 + addpd xmm3, xmm4 + + vinsertf128 ymm1, ymm1, xmm3, 1 +%endif + + FMULADD_PD m0, m1, [vec2q + len2q], m0, m6 + +%if mmsize == 32 + add matq, r6 +%else + add matq, len1q +%endif + add len2q, mmsize + js .loop_2 + + haddpd m0, m0 + +%if mmsize == 32 + vextractf128 xmm1, ymm0, 1 + addpd 
xmm0, xmm1 +%endif + + emms + + RET +%endmacro + +INIT_XMM sse3 +SCALARPRODUCT_METRIC + +INIT_YMM avx +SCALARPRODUCT_METRIC + +INIT_YMM fma3 +SCALARPRODUCT_METRIC + diff --git a/src/gamma_freeze_template.c b/src/gamma_freeze_template.c new file mode 100644 index 0000000..8edda4d --- /dev/null +++ b/src/gamma_freeze_template.c @@ -0,0 +1,507 @@ +/* + * Minimal distortion -- template for the equations definitions + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#define FUNC3(a, b) a ## _ ## b +#define FUNC2(a, b) FUNC3(a, b) +#define FUNC(name) FUNC2(name, EQUATION) + +/** + * A template for calculating the equation coefficients. 
+ */ +static void FUNC(calc_eq_coeffs)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + const MDCalcEqThread *et = arg; + const MDSolver *ctx = et->ctx; + MDEquationContext *eq_ctx = et->eq_ctx; + + const int start = job_idx * et->block_size; + const int end = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx)); + + for (int i = start; i < end; i++) { + const double x = eq_ctx->interp_coords[0][i]; + const double z = eq_ctx->interp_coords[2][i]; + const int zaxis = x <= EPS; + + double c1o3 = (1.0 / 3.0); + + double gtu[3][3], g[3][3], gu[3][3]; + double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], dgtu[3][3][3], G[3][3][3], dG[3][3][3][3]; + double Gt[3][3][3]; + double dXt[3][3]; + double A[3][3], Au[3][3], Atu[3][3]; + double dA[3][3][3], dAu[3][3][3]; + double Ric[3][3], Ricm[3][3]; + double rhs_x, rhs_z; + + const double gtxx = eq_ctx->interp_values[I_GTXX][i]; + const double gtyy = eq_ctx->interp_values[I_GTYY][i]; + const double gtzz = eq_ctx->interp_values[I_GTZZ][i]; + const double gtxy = eq_ctx->interp_values[I_GTXY][i]; + const double gtxz = eq_ctx->interp_values[I_GTXZ][i]; + const double gtyz = eq_ctx->interp_values[I_GTYZ][i]; + + const double gt[3][3] = {{ gtxx, gtxy, gtxz }, + { gtxy, gtyy, gtyz }, + { gtxz, gtyz, gtzz }}; + + const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i]; + const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i]; + const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i]; + const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i]; + + const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i]; + const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i]; + const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i]; + const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i]; + + const double dgt[3][3][3] = { + { + { dx_gt11, 0.0, dx_gt13 }, + { 0.0, dx_gt22, 0.0 }, + { dx_gt13, 0.0, dx_gt33 }, + }, + { + { 0.0, zaxis ? 
dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 }, + { zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x }, + { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 }, + }, + { + { dz_gt11, 0.0, dz_gt13 }, + { 0.0, dz_gt22, 0.0 }, + { dz_gt13, 0.0, dz_gt33 }, + }, + }; + + const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i]; + const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i]; + const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i]; + const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i]; + + const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i]; + const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i]; + const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i]; + const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i]; + + const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i]; + const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i]; + const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i]; + const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i]; + + const double d2gt[3][3][3][3] = { + { + { + { dxx_gt11, 0.0, dxx_gt13 }, + { 0.0, dxx_gt22, 0.0 }, + { dxx_gt13, 0.0, dxx_gt33 }, + }, + { + { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 }, + }, + { + { dxz_gt11, 0.0, dxz_gt13 }, + { 0.0, dxz_gt22, 0.0 }, + { dxz_gt13, 0.0, dxz_gt33 }, + }, + + }, + { + { + { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 }, + }, + { + { zaxis ? 
dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x }, + }, + { + { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 }, + { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0, + zaxis ? dxz_gt13 : dz_gt13 / x }, + { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 }, + }, + + }, + { + { + { dxz_gt11, 0.0, dxz_gt13 }, + { 0.0, dxz_gt22, 0.0 }, + { dxz_gt13, 0.0, dxz_gt33 }, + }, + { + { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 }, + { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0, + zaxis ? dxz_gt13 : dz_gt13 / x }, + { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 }, + }, + { + { dzz_gt11, 0.0, dzz_gt13 }, + { 0.0, dzz_gt22, 0.0 }, + { dzz_gt13, 0.0, dzz_gt33 }, + }, + + }, + }; + + const double Atxx = eq_ctx->interp_values[I_ATXX][i]; + const double Atyy = eq_ctx->interp_values[I_ATYY][i]; + const double Atzz = eq_ctx->interp_values[I_ATZZ][i]; + const double Atxy = eq_ctx->interp_values[I_ATXY][i]; + const double Atxz = eq_ctx->interp_values[I_ATXZ][i]; + const double Atyz = eq_ctx->interp_values[I_ATYZ][i]; + + const double trK = eq_ctx->interp_values[I_TRK][i]; + + const double dx_trK = eq_ctx->interp_values[I_TRK_DX][i]; + const double dz_trK = eq_ctx->interp_values[I_TRK_DZ][i]; + + const double dtrK[3] = { dx_trK, 0.0, dz_trK }; + + const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i]; + const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i]; + const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i]; + const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i]; + + const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i]; + const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i]; + const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i]; + const double dz_At13 
= eq_ctx->interp_values[I_ATXZ_DZ][i]; + + const double dAt[3][3][3] = { + { + { dx_At11, 0.0, dx_At13 }, + { 0.0, dx_At22, 0.0 }, + { dx_At13, 0.0, dx_At33 }, + }, + { + { 0.0, zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 }, + { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x }, + { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 }, + }, + { + { dz_At11, 0.0, dz_At13 }, + { 0.0, dz_At22, 0.0 }, + { dz_At13, 0.0, dz_At33 }, + }, + }; + + const double phi = eq_ctx->interp_values[I_PHI][i]; + + const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i]; + const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i]; + + const double dphi[3] = { phi_dx, 0.0, phi_dz }; + + const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i]; + const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i]; + const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i]; + + const double d2phi[3][3] = { + { phi_dxx, 0.0, phi_dxz }, + { 0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 }, + { phi_dxz, 0.0, phi_dzz }, + }; + + const double At[3][3] = {{ Atxx, Atxy, Atxz }, + { Atxy, Atyy, Atyz }, + { Atxz, Atyz, Atzz }}; + + const double alpha = eq_ctx->interp_values[I_ALPHA][i]; + const double dx_alpha = eq_ctx->interp_values[I_ALPHA_DX][i]; + const double dz_alpha = eq_ctx->interp_values[I_ALPHA_DZ][i]; + + const double dalpha[3] = { dx_alpha, 0.0, dz_alpha }; + + const double Xtx = eq_ctx->interp_values[I_XTX][i]; + const double Xtz = eq_ctx->interp_values[I_XTZ][i]; + + const double Xt[3] = { Xtx, 0.0, Xtz }; + + const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); + + // \tilde{γ}^{ij} + gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; + gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; + gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; + gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; + gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; + gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; + gtu[1][0] = gtu[0][1]; + gtu[2][0] = 
gtu[0][2]; + gtu[2][1] = gtu[1][2]; + + // γ_{jk}/^{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + gu[j][k] = SQR(phi) * gtu[j][k]; + g[j][k] = gt[j][k] / SQR(phi); + } + + // ∂_j γ_{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi); + dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi); + } + + // ∂_j \tilde{γ}^{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + for (int n = 0; n < 3; n++) + val += -gtu[k][m] * gtu[l][n] * dgt[j][m][n]; + dgtu[j][k][l] = val; + } + + // ∂_j γ^{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + for (int n = 0; n < 3; n++) + val += -gu[k][m] * gu[l][n] * dg[j][m][n]; + dgu[j][k][l] = val; + } + + // ∂_{jk} g_{lm} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) { + d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) - + 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) - + 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) - + 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) + + d2gt[j][k][l][m] / SQR(phi); + } + + // \tilde{Γ}^j_{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + val += 0.5 * gtu[j][m] * (dgt[k][l][m] + dgt[l][k][m] - dgt[m][k][l]); + Gt[j][k][l] = val; + } + + // Γ^j_{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]); + G[j][k][l] = val; + } + + // ∂_j Γ^k_{lm} + for (int j = 0; j < 3; j++) + for 
(int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) { + double val = 0.0; + for (int n = 0; n < 3; n++) { + val += dgu[j][k][n] * (dg [l][m][n] + dg [m][l][n] - dg [n][l][m]) + + gu [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]); + } + dG[j][k][l][m] = 0.5 * val; + } + + // ∂_j Γ^k + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) + val += gtu[l][m] * dG[j][k][l][m] + dgtu[j][l][m] * G[k][l][m]; + dXt[j][k] = val; + } + + // Ric_{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + val += dG[m][m][j][k] - dG[k][m][j][m]; + for (int m = 0; m < 3; m++) + for (int l = 0; l < 3; l++) + val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l]; + Ric[j][k] = val; + } + + // Ric^j_k + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += gu[j][l] * Ric[l][k]; + Ricm[j][k] = val; + } + + // A_{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + A[j][k] = At[j][k] / SQR(phi); + } + + // d_j A^{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + for (int n = 0; n < 3; n++) + val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n]; + dAu[j][k][l] = val; + } + + // \tilde{A}^{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) + val += gtu[j][l] * gtu[k][m] * At[l][m]; + Atu[j][k] = val; + } + + // A^{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) + val += gu[j][l] * gu[k][m] * A[l][m]; + Au[j][k] = val; + } + + rhs_x = 0.0; + rhs_z = 0.0; + for (int j = 0; 
j < 3; j++) { + rhs_x += dalpha[j] * Atu[0][j]; + rhs_z += dalpha[j] * Atu[2][j]; + } + double val_x = 0.0; + double val_z = 0.0; + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 3; k++) { + val_x += -Gt[0][j][k] * Atu[j][k]; + val_z += -Gt[2][j][k] * Atu[j][k]; + } + } + rhs_x += val_x * alpha; + rhs_z += val_z * alpha; + for (int j = 0; j < 3; j++) { + rhs_x += alpha * (2.0 / 3.0) * gtu[0][j] * dtrK[j]; + rhs_z += alpha * (2.0 / 3.0) * gtu[2][j] * dtrK[j]; + } + for (int j = 0; j < 3; j++) { + rhs_x += alpha * 3.0 * Atu[0][j] * dphi[j]/ phi; + rhs_z += alpha * 3.0 * Atu[2][j] * dphi[j]/ phi; + } + + rhs_x *= 2.0; + rhs_z *= 2.0; + + double X[3] = { 0.0 }; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + X[0] += gu[j][k] * G[0][j][k]; + X[2] += gu[j][k] * G[2][j][k]; + } + + if (EQUATION == 0) { + /* eq 0 */ + /* ∂_{xx}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + c1o3 * gtu[0][0] + (zaxis ? 0.5 * (gtu[1][1] + c1o3 * gtu[0][0]) : 0.0); + /* ∂_{xx}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0; + /* ∂_{zz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2]; + /* ∂_{zz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gtu[0][2]; + + /* ∂_{xz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2] + (zaxis ? c1o3 * gtu[0][2] : 0.0); + /* ∂_{xz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[0][0]; + + /* ∂_{x}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (2.0 / 3.0) * Xt[0] + (zaxis ? (2.0 / 3.0) * Xt[0] : (gtu[1][1] + c1o3 * gtu[0][0]) / x); + /* ∂_{x}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 0.0; + + /* ∂_{z}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (zaxis ? 0.0 : c1o3 * gtu[0][2] / x); + /* ∂_{z}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = (2.0 / 3.0) * Xt[0]; + + /* β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][0] + (zaxis ? 
0.0 : (2.0 / 3.0) * Xt[0] / x - (gtu[1][1] + c1o3 * gtu[0][0]) / SQR(x)); + + /* β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][0]; + + eq_ctx->rhs[i] = rhs_x; + } else { + /* eq 1 */ + /* ∂_{xx}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gtu[2][0] + (zaxis ? 0.5 * c1o3 * gtu[2][0] : 0.0); + /* ∂_{xx}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gtu[0][0] + (zaxis ? gtu[1][1] : 0.0); + /* ∂_{zz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0; + /* ∂_{zz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gtu[2][2] + c1o3 * gtu[2][2]; + /* ∂_{xz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gtu[2][2] + (zaxis ? c1o3 * gtu[2][2] : 0.0); + /* ∂_{xz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gtu[0][2] + c1o3 * gtu[0][2]; + + /* ∂_{x}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = (2.0 / 3.0) * Xt[2] + (zaxis ? (2.0 / 3.0) * Xt[2] : c1o3 * gtu[2][0] / x); + /* ∂_{x}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = -Xt[0] + (zaxis ? 0.0 : gtu[1][1] / x); + /* ∂_{z}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = (zaxis ? 0.0 : c1o3 * gtu[2][2] / x); + /* ∂_{z}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = -Xt[2] + (2.0 / 3.0) * Xt[2]; + + /* β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = dXt[0][2] + (zaxis ? 
0.0 : (2.0 / 3.0) * Xt[2] / x - c1o3 * gtu[2][0] / SQR(x)); + + /* β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = dXt[2][2]; + + eq_ctx->rhs[i] = rhs_z; + } + } +} diff --git a/src/make.code.defn b/src/make.code.defn new file mode 100644 index 0000000..cc89085 --- /dev/null +++ b/src/make.code.defn @@ -0,0 +1,7 @@ +# Main make.code.defn file for thorn MaximalSlicingAxi + +# Source files in this directory +SRCS = basis.c bicgstab.c md.c md_solve.c pssolve.c expansion.asm threadpool.c register.c + +# Subdirectories containing source files +SUBDIRS = diff --git a/src/md.c b/src/md.c new file mode 100644 index 0000000..21e38fc --- /dev/null +++ b/src/md.c @@ -0,0 +1,573 @@ +#include "common.h" + +#include <ctype.h> +#include <errno.h> +#include <float.h> +#include <inttypes.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <cblas.h> + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "cctk_Parameters.h" +#include "cctk_Timers.h" +#include "util_Table.h" + +#include "md.h" +#include "md_solve.h" +#include "threadpool.h" + +typedef struct EvalContext { + struct MDContext *md; + struct CoordPatch *cp; + const double *x; + const double *z; + double *W; + + const double *coeffs; + double nb_coeffs[2]; + + double *eval_tmp[2]; + + unsigned int x_idx_start; + unsigned int x_idx_end; + unsigned int z_idx_start; + unsigned int z_idx_end; +} EvalContext; + +/* precomputed values for a given refined grid */ +typedef struct CoordPatch { + CCTK_REAL origin[3]; + CCTK_INT delta[3]; + CCTK_INT size[3]; + + // basis values on the grid + double *basis_val_r; + double *basis_val_z; + + double *transform_z; + double *transform_matrix; + double *transform_matrix1; + double *transform_matrix2; + double *transform_matrix3; + double *transform_tmp; + + int y_idx; + + int nb_threads; + ThreadPoolContext *tp; + EvalContext *ec; +} CoordPatch; + +struct MDContext { + MDSolver *solver; + cGH *gh; + 
ThreadPoolContext *tp; + + struct { + double time; + double *coeffs; + } solution_cache[8]; + int nb_solutions; + + double *coeffs_eval; + + uint64_t grid_expand_count; + uint64_t grid_expand_time; + + CoordPatch *patches; + int nb_patches; +}; + +/* get an approximate "main" frequency component in a basis function */ +static double calc_basis_freq(const MDBasisSetContext *b, int order) +{ + return md_basis_colloc_point(b, order, 1); +} + +static CoordPatch *get_coord_patch(MDContext *md, + CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, + double scale_factor, double scale_power) +{ + cGH *cctkGH = md->gh; + + CoordPatch *cp; + int64_t grid_size; + int i, block_size; + const char *nb_threads; + + for (int i = 0; i < md->nb_patches; i++) { + cp = &md->patches[i]; + + if (cp->origin[0] == md->gh->cctk_origin_space[0] && + cp->origin[1] == md->gh->cctk_origin_space[1] && + cp->origin[2] == md->gh->cctk_origin_space[2] && + cp->size[0] == md->gh->cctk_lsh[0] && + cp->size[1] == md->gh->cctk_lsh[1] && + cp->size[2] == md->gh->cctk_lsh[2] && + cp->delta[0] == md->gh->cctk_levfac[0] && + cp->delta[1] == md->gh->cctk_levfac[1] && + cp->delta[2] == md->gh->cctk_levfac[2]) + return cp; + } + + grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; + + /* create a new patch */ + md->patches = realloc(md->patches, sizeof(*md->patches) * (md->nb_patches + 1)); + cp = &md->patches[md->nb_patches]; + + memset(cp, 0, sizeof(*cp)); + + memcpy(cp->origin, md->gh->cctk_origin_space, sizeof(cp->origin)); + memcpy(cp->size, md->gh->cctk_lsh, sizeof(cp->size)); + memcpy(cp->delta, md->gh->cctk_levfac, sizeof(cp->delta)); + + for (i = 0; i < cp->size[1]; i++) + if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) { + cp->y_idx = i; + break; + } + if (i == cp->size[1]) + CCTK_WARN(0, "The grid does not include y==0"); + +#if MD_POLAR || 1 + posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * md->solver->nb_coeffs[0] * cp->size[0] * 
cp->size[2]); + posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix2, 32, sizeof(*cp->transform_matrix2) * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix3, 32, sizeof(*cp->transform_matrix3) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]); +#pragma omp parallel for + for (int j = 0; j < cp->size[2]; j++) { + double zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, j)]; + + for (int i = 0; i < cp->size[0]; i++) { + const int idx_grid = j * cp->size[0] + i; + + double xx = x[CCTK_GFINDEX3D(md->gh, i, 0, 0)]; + double rr = sqrt(SQR(xx) + SQR(zz)); + + double coord0 = xx; + double coord1 = zz; + + //for (int k = 0; k < md->nb_coeffs_z; k++) + // for (int l = 0; l < md->nb_coeffs_x; l++) { + // const int idx_coeff = k * md->nb_coeffs_x + l; + // cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = md->basis->eval(r, l) * md->basis1->eval(phi, k); + // } + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) { + double dx = calc_basis_freq(md->solver->basis[0][0], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[0][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[1]; k++) { + double dx = calc_basis_freq(md->solver->basis[0][1], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix1[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[0][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) { + double dx = calc_basis_freq(md->solver->basis[1][0], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr 
/ r0, scale_power)); + + cp->transform_matrix2[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[1][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[1]; k++) { + double dx = calc_basis_freq(md->solver->basis[1][1], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix3[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[1][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact; + } + } + } + posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * md->solver->nb_coeffs[1]); +#else + posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * md->solver->nb_coeffs[0] * md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0]); + for (int j = 0; j < md->gh->cctk_lsh[1]; j++) + for (int i = 0; i < md->gh->cctk_lsh[0]; i++) { + CCTK_REAL xx = x[CCTK_GFINDEX3D(md->gh, i, j, 0)]; + CCTK_REAL yy = y[CCTK_GFINDEX3D(md->gh, i, j, 0)]; + CCTK_REAL r = sqrt(SQR(xx) + SQR(yy)); + + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) + //cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) * md->nb_coeffs_x + k] = md->basis->eval(r, k); + cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) + md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0] * k] = md->solver->basis[0]->eval(r, k); + } + + posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * md->solver->nb_coeffs[1] * md->gh->cctk_lsh[2]); + for (int i = 0; i < md->gh->cctk_lsh[2]; i++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, i)]; + for (int j = 0; j < md->solver->nb_coeffs[1]; j++) + cp->basis_val_z [i * md->solver->nb_coeffs[1] + j] = md->solver->basis[0]->eval(fabs(zz), j); + //cp->basis_val_z [i + md->gh->cctk_lsh[2] * j] = md->basis->eval(zz, j); + } + posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * md->solver->nb_coeffs[0]); +#endif + +#if 0 + nb_threads = 
getenv("OMP_NUM_THREADS"); + if (nb_threads) + cp->nb_threads = atoi(nb_threads); + if (cp->nb_threads <= 0) + cp->nb_threads = 1; + md_threadpool_init(&cp->tp, cp->nb_threads); + cp->ec = calloc(cp->nb_threads, sizeof(*cp->ec)); + + block_size = (md->gh->cctk_lsh[2] + cp->nb_threads - 1) / cp->nb_threads; + + for (int i = 0; i < cp->nb_threads; i++) { + EvalContext *ec = &cp->ec[i]; + + ec->md = md; + + ec->nb_coeffs[0] = md->solver->nb_coeffs[0]; + ec->nb_coeffs[1] = md->solver->nb_coeffs[1]; + + posix_memalign((void**)&ec->eval_tmp[0], 32, sizeof(*ec->eval_tmp[0]) * ec->nb_coeffs[0]); + posix_memalign((void**)&ec->eval_tmp[1], 32, sizeof(*ec->eval_tmp[1]) * ec->nb_coeffs[1]); + + ec->x_idx_start = 0; + ec->x_idx_end = md->gh->cctk_lsh[0]; + + ec->z_idx_start = block_size * i; + ec->z_idx_end = MIN(block_size * (i + 1), md->gh->cctk_lsh[2]); + } +#endif + + md->nb_patches++; + return cp; +} + +static MDContext *md_context; + +static int context_init(cGH *cctkGH) +{ + int threads_type; + const int *threads = CCTK_ParameterGet("num_threads", "Carpet", &threads_type); + + MDContext *md; + int ret; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + md = calloc(1, sizeof(*md)); + if (!md) + return -ENOMEM; + + md->gh = cctkGH; + + ret = md_threadpool_init(&md->tp, *threads); + if (ret < 0) + return ret; + + ret = md_solver_init(&md->solver, cctkGH, md->tp, 2, + (unsigned int [2][2]){ { basis_order_r, basis_order_z }, + { basis_order_r, basis_order_z }}, + scale_factor, filter_power, 0.0); + if (ret < 0) + return ret; + + ret = posix_memalign((void**)&md->coeffs_eval, 32, + basis_order_r * basis_order_z * sizeof(*md->coeffs_eval)); + if (ret) + return -ENOMEM; + + for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++) { + ret = posix_memalign((void**)&md->solution_cache[i].coeffs, 32, + 2 * basis_order_r * basis_order_z * sizeof(*md->solution_cache[i].coeffs)); + if (ret) + return -ENOMEM; + } + + md_context = md; + + return 0; +} + +void 
/*
 * Plain C reference implementation of the bilinear form vec2^T * mat * vec1.
 *
 * mat  is a row-major len2 x len1 matrix (element (l, m) at mat[l * len1 + m]),
 * vec1 has len1 elements,
 * vec2 has len2 elements.
 *
 * Returns sum_l vec2[l] * (sum_m mat[l][m] * vec1[m]); 0.0 when either length
 * is zero. The inputs are only read, matching the const-qualified prototype
 * of md_scalarproduct_metric_avx().
 */
static double md_scalarproduct_metric_c(size_t len1, size_t len2, const double *mat,
                                        const double *vec1, const double *vec2)
{
    double val = 0.0;

    /* size_t counters: the lengths are size_t, so an int counter would mix
     * signedness in the comparison and truncate for very large inputs */
    for (size_t l = 0; l < len2; l++) {
        const double *row = mat + l * len1;
        double tmp = 0.0;

        for (size_t m = 0; m < len1; m++)
            tmp += row[m] * vec1[m];

        val += tmp * vec2[l];
    }

    return val;
}
e->z_idx_start; k < e->z_idx_end; k++) { + for (int i = e->x_idx_start; i < e->x_idx_end; i++) { + int idx = CCTK_GFINDEX3D(gh, i, cp->y_idx, k); + double xx = e->x[idx]; + double zz = e->z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double *basis_vec1 = e->eval_tmp[0]; + double *basis_vec2 = e->eval_tmp[1]; + + for (int l = 0; l < e->nb_coeffs[0]; l++) + basis_vec1[l] = md->solver->basis[0]->eval(r, l); + for (int l = 0; l < e->nb_coeffs[0]; l++) + basis_vec2[l] = md->solver->basis[1]->eval(phi, l); + + W[idx] = md_scalarproduct_metric_avx(e->nb_coeffs[0], e->nb_coeffs[1], e->coeffs, + basis_vec1, basis_vec2); + } + } +} +#endif + +void minimal_distortion_eval(CCTK_ARGUMENTS) +{ + MDContext *md; + + CoordPatch *cp; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + double *beta1 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta1"); + double *beta3 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta3"); + + double time; + + int64_t expand_start; + + double *coeffs = NULL; + int i, ret; + + if (!md_context) + context_init(cctkGH); + + time = cctkGH->cctk_time; + + md = md_context; + + cp = get_coord_patch(md, x, y, z, scale_factor, scale_power); + +#if 1 + //coeffs = md->coeffs; + coeffs = md->solution_cache[md->nb_solutions - 1].coeffs; +#elif 0 + if (time < 10.0) { + return; + } else if (time < 11.0) { + double fact = exp(-36.0 * pow((10.0 - time), 4.0)); + double *coeffs_src = md->solution_cache[md->nb_solutions - 1].coeffs; + + coeffs = md->coeffs_eval; + for (int i = 0; i < md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1] * 2; i++) + coeffs[i] = coeffs_src[i] * fact; + } else + coeffs = md->solution_cache[md->nb_solutions - 1].coeffs; + +#else + coeffs = md->coeffs_eval; + + if (cctkGH->cctk_levfac[0] < 1 || md->nb_solutions < 2) { + memset(coeffs, 0, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + //fprintf(stderr, "md eval: time %g zero\n", md->gh->cctk_time); + } else { + double *coeffs0 = 
md->solution_cache[md->nb_solutions - 2].coeffs; + double *coeffs1 = md->solution_cache[md->nb_solutions - 1].coeffs; + double time0 = md->solution_cache[md->nb_solutions - 2].time; + double time1 = md->solution_cache[md->nb_solutions - 1].time; + + double fact = 1.0; + + //if (time < 9.0) + // fact = 1.0; + //else + // fact = exp(-36.0 * pow((time - 9.0), 4.0)); + //else if (time < 0.1) + // fact = 0.0; + //else + // fact = (1.0 - exp(-pow((time - 0.0) / 0.25, 4.0))); + //fact = 1.0; + + //fprintf(stderr, "md eval: time %g interp from %g %g %g\n", md->gh->cctk_time, time0, time1, fact); + + for (int i = 0; i < 2 * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]; i++) + coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact; + + } +#endif + + if (export_coeffs) { + memcpy(betax_coeffs, coeffs, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + memcpy(betaz_coeffs, coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1], + sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + } + + CCTK_TimerStart("MinimalDistortion_Expand"); + expand_start = gettime(); +#if 0 +#pragma omp parallel for + for (int k = 0; k < cctk_lsh[2]; k++) { + for (int i = 0; i < cctk_lsh[0]; i++) { + int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); + double xx = x[idx]; + double zz = z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double val = 0.0; + + for (int l = 0; l < md->nb_coeffs_z; l++) { + double tmp = 0.0; + for (int m = 0; m < md->nb_coeffs_x; m++) { + const int idx_coeff = l * md->nb_coeffs_x + m; + tmp += coeffs[idx_coeff] * md->basis->eval(r, m); + } + val += tmp * md->basis1->eval(phi, l); + } + + W[idx] = val; + } + } +#elif 0 + { + for (int i = 0; i < cp->nb_threads; i++) { + cp->ec[i].cp = cp; + cp->ec[i].x = x; + cp->ec[i].z = z; + cp->ec[i].W = W; + cp->ec[i].coeffs = coeffs; + } + md_threadpool_execute(cp->tp, cp->nb_threads, 
md_eval, cp->ec); + } +#elif MD_POLAR || 1 + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0], + 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], + coeffs, md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * md->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + beta1[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; + } + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0], + 1.0, cp->transform_matrix2, cctk_lsh[0] * cctk_lsh[2], + coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1], + md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix3 + idx_grid * md->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + beta3[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; + } +#else + memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W)); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + md->solver->nb_coeffs[0], cctk_lsh[2], md->solver->nb_coeffs[1], 1.0, + coeffs, md->solver->nb_coeffs[0], cp->basis_val_z, md->solver->nb_coeffs[1], + 0.0, cp->transform_z, md->solver->nb_coeffs[0]); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], md->solver->nb_coeffs[0], 1.0, + cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, 
md->solver->nb_coeffs[0], + 1.0, W, cctk_lsh[0] * cctk_lsh[1]); +#endif + + md->grid_expand_time += gettime() - expand_start; + md->grid_expand_count++; + + CCTK_TimerStop("MinimalDistortion_Expand"); + + /* print stats */ + if (!(md->grid_expand_count & 255)) { + fprintf(stderr, "Minimal distortion stats:\n"); + + md_solver_print_stats(md->solver); + + fprintf(stderr, + "%lu evals: total time %g s, avg time per call %g md\n", + md->grid_expand_count, (double)md->grid_expand_time / 1e6, + (double)md->grid_expand_time / md->grid_expand_count / 1e3); + } +} + +void minimal_distortion_init(CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + if (!md_context) + context_init(cctkGH); +} diff --git a/src/md.h b/src/md.h new file mode 100644 index 0000000..0a4a917 --- /dev/null +++ b/src/md.h @@ -0,0 +1,19 @@ +#ifndef MD_MD_H +#define MD_MD_H + +#include "common.h" + +#if HAVE_OPENCL +#include <cl.h> +#endif + +#include <inttypes.h> + +#include "cctk.h" + +#include "md_solve.h" +#include "threadpool.h" + +typedef struct MDContext MDContext; + +#endif /* MD_MD_H */ diff --git a/src/md_solve.c b/src/md_solve.c new file mode 100644 index 0000000..c7fa329 --- /dev/null +++ b/src/md_solve.c @@ -0,0 +1,818 @@ +/* + * Minimal distortion -- actual solver code + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. 
If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" + +#include <errno.h> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if HAVE_OPENCL +#include <cl.h> +#include <clBLAS.h> +#endif + +#include "cctk.h" +#include "cctk_Timers.h" +#include "util_Table.h" + +#include "basis.h" +#include "pssolve.h" +#include "md_solve.h" +#include "threadpool.h" + +#define NB_COEFFS(md) (md->nb_coeffs[0] * md->nb_coeffs[1]) +#define NB_COLLOC_POINTS(md) (md->nb_colloc_points[0] * md->nb_colloc_points[1]) + +/* indices (in our code, not cactus structs) of the grid functions which we'll need to + * interpolate on the pseudospectral grid */ +enum MetricVars { + GTXX = 0, + GTYY, + GTZZ, + GTXY, + GTXZ, + GTYZ, + PHI, + ATXX, + ATYY, + ATZZ, + ATXY, + ATXZ, + ATYZ, + XTX, + XTY, + XTZ, + ALPHA, + TRK, + NB_METRIC_VARS, +}; + +/* indices of the interpolated values of the above grid functions and their derivatives */ +enum InterpMetricVars { + I_GTXX = 0, + I_GTYY, + I_GTZZ, + I_GTXY, + I_GTXZ, + I_GTYZ, + I_GTXX_DX, + I_GTYY_DX, + I_GTZZ_DX, + I_GTXZ_DX, + I_GTXX_DZ, + I_GTYY_DZ, + I_GTZZ_DZ, + I_GTXZ_DZ, + I_GTXX_DXX, + I_GTYY_DXX, + I_GTZZ_DXX, + I_GTXZ_DXX, + I_GTXX_DXZ, + I_GTYY_DXZ, + I_GTZZ_DXZ, + I_GTXZ_DXZ, + I_GTXX_DZZ, + I_GTYY_DZZ, + I_GTZZ_DZZ, + I_GTXZ_DZZ, + I_PHI, + I_PHI_DX, + I_PHI_DY, + I_PHI_DZ, + I_PHI_DXX, + I_PHI_DZZ, + I_PHI_DXZ, + I_ATXX, + I_ATYY, + I_ATZZ, + I_ATXY, + I_ATXZ, + I_ATYZ, + I_ATXX_DX, + I_ATYY_DX, + I_ATZZ_DX, + I_ATXZ_DX, + I_ATXX_DZ, + I_ATYY_DZ, + I_ATZZ_DZ, + I_ATXZ_DZ, + I_XTX, + I_XTY, + I_XTZ, + I_ALPHA, + I_ALPHA_DX, + I_ALPHA_DY, + I_ALPHA_DZ, + I_TRK, + I_TRK_DX, + I_TRK_DZ, + NB_INTERP_VARS, +}; + +/* per-equation state */ +typedef struct MDEquationContext { + double *interp_coords[3]; + double *interp_values[NB_INTERP_VARS]; + + /* eq_coeffs[i][j] is an array of coefficients at the collocation points + * for j-th derivative of i-th unknown function */ + double 
*(*eq_coeffs)[PSSOLVE_DIFF_ORDER_NB]; + + double *rhs; +} MDEquationContext; + +struct MDSolverPriv { + PSSolveContext *ps_ctx; + cGH *gh; + + MDEquationContext *eqs; + + int colloc_grid_order[2]; + + double *rhs; + + double *coeff_scale; + + // interpolation parameters + int coord_system; + int interp_operator; + int interp_params; + + CCTK_REAL *interp_coords[3]; + + int interp_vars_indices[NB_METRIC_VARS]; + CCTK_REAL *interp_values[NB_INTERP_VARS]; + CCTK_INT interp_value_codes[NB_INTERP_VARS]; + +#if HAVE_OPENCL + // OpenCL / CLBLAS stuff + cl_context ocl_ctx; + cl_command_queue ocl_queue; +#endif + + ThreadPoolContext *tp; + ThreadPoolContext *tp_internal; + + uint64_t solve_count; + uint64_t solve_time; + + uint64_t interp_geometry_count; + uint64_t interp_geometry_time; + + uint64_t calc_eq_coeffs_count; + uint64_t calc_eq_coeffs_time; +}; + +typedef struct MDCalcEqThread { + MDSolver *ctx; + MDEquationContext *eq_ctx; + size_t block_size; +} MDCalcEqThread; + +/* mapping between our indices and thorn names */ +static const char *metric_vars[] = { + [GTXX] = "ML_BSSN::gt11", + [GTYY] = "ML_BSSN::gt22", + [GTZZ] = "ML_BSSN::gt33", + [GTXY] = "ML_BSSN::gt12", + [GTXZ] = "ML_BSSN::gt13", + [GTYZ] = "ML_BSSN::gt23", + [ATXX] = "ML_BSSN::At11", + [ATYY] = "ML_BSSN::At22", + [ATZZ] = "ML_BSSN::At33", + [ATXY] = "ML_BSSN::At12", + [ATXZ] = "ML_BSSN::At13", + [ATYZ] = "ML_BSSN::At23", + [PHI] = "ML_BSSN::phi", + [XTX] = "ML_BSSN::Xt1", + [XTY] = "ML_BSSN::Xt2", + [XTZ] = "ML_BSSN::Xt3", + [ALPHA] = "ML_BSSN::alpha", + [TRK] = "ML_BSSN::trK", +}; + +/* mapping between the cactus grid values and interpolated values */ +static const CCTK_INT interp_operation_indices[] = { + [I_GTXX] = GTXX, + [I_GTYY] = GTYY, + [I_GTZZ] = GTZZ, + [I_GTXY] = GTXY, + [I_GTXZ] = GTXZ, + [I_GTYZ] = GTYZ, + [I_GTXX_DX] = GTXX, + [I_GTYY_DX] = GTYY, + [I_GTZZ_DX] = GTZZ, + [I_GTXZ_DX] = GTXZ, + [I_GTXX_DZ] = GTXX, + [I_GTYY_DZ] = GTYY, + [I_GTZZ_DZ] = GTZZ, + [I_GTXZ_DZ] = GTXZ, + 
[I_GTXX_DXX] = GTXX, + [I_GTYY_DXX] = GTYY, + [I_GTZZ_DXX] = GTZZ, + [I_GTXZ_DXX] = GTXZ, + [I_GTXX_DXZ] = GTXX, + [I_GTYY_DXZ] = GTYY, + [I_GTZZ_DXZ] = GTZZ, + [I_GTXZ_DXZ] = GTXZ, + [I_GTXX_DZZ] = GTXX, + [I_GTYY_DZZ] = GTYY, + [I_GTZZ_DZZ] = GTZZ, + [I_GTXZ_DZZ] = GTXZ, + [I_PHI] = PHI, + [I_PHI_DX] = PHI, + [I_PHI_DY] = PHI, + [I_PHI_DZ] = PHI, + [I_PHI_DXX] = PHI, + [I_PHI_DZZ] = PHI, + [I_PHI_DXZ] = PHI, + [I_ATXX] = ATXX, + [I_ATYY] = ATYY, + [I_ATZZ] = ATZZ, + [I_ATXY] = ATXY, + [I_ATXZ] = ATXZ, + [I_ATYZ] = ATYZ, + [I_ATXX_DX] = ATXX, + [I_ATYY_DX] = ATYY, + [I_ATZZ_DX] = ATZZ, + [I_ATXZ_DX] = ATXZ, + [I_ATXX_DZ] = ATXX, + [I_ATYY_DZ] = ATYY, + [I_ATZZ_DZ] = ATZZ, + [I_ATXZ_DZ] = ATXZ, + [I_XTX] = XTX, + [I_XTY] = XTY, + [I_XTZ] = XTZ, + [I_ALPHA] = ALPHA, + [I_ALPHA_DX] = ALPHA, + [I_ALPHA_DY] = ALPHA, + [I_ALPHA_DZ] = ALPHA, + [I_TRK] = TRK, + [I_TRK_DX] = TRK, + [I_TRK_DZ] = TRK, +}; + +/* the operation (plain value or x/y/z-derivative) to apply during interpolation */ +static const CCTK_INT interp_operation_codes[] = { + [I_GTXX] = 0, + [I_GTYY] = 0, + [I_GTZZ] = 0, + [I_GTXY] = 0, + [I_GTXZ] = 0, + [I_GTYZ] = 0, + [I_GTXX_DX] = 1, + [I_GTYY_DX] = 1, + [I_GTZZ_DX] = 1, + [I_GTXZ_DX] = 1, + [I_GTXX_DZ] = 3, + [I_GTYY_DZ] = 3, + [I_GTZZ_DZ] = 3, + [I_GTXZ_DZ] = 3, + [I_GTXX_DXX] = 11, + [I_GTYY_DXX] = 11, + [I_GTZZ_DXX] = 11, + [I_GTXZ_DXX] = 11, + [I_GTXX_DXZ] = 13, + [I_GTYY_DXZ] = 13, + [I_GTZZ_DXZ] = 13, + [I_GTXZ_DXZ] = 13, + [I_GTXX_DZZ] = 33, + [I_GTYY_DZZ] = 33, + [I_GTZZ_DZZ] = 33, + [I_GTXZ_DZZ] = 33, + [I_PHI] = 0, + [I_PHI_DX] = 1, + [I_PHI_DY] = 2, + [I_PHI_DZ] = 3, + [I_PHI_DXX] = 11, + [I_PHI_DZZ] = 33, + [I_PHI_DXZ] = 13, + [I_ATXX] = 0, + [I_ATYY] = 0, + [I_ATZZ] = 0, + [I_ATXY] = 0, + [I_ATXZ] = 0, + [I_ATYZ] = 0, + [I_ATXX_DX] = 1, + [I_ATYY_DX] = 1, + [I_ATZZ_DX] = 1, + [I_ATXZ_DX] = 1, + [I_ATXX_DZ] = 3, + [I_ATYY_DZ] = 3, + [I_ATZZ_DZ] = 3, + [I_ATXZ_DZ] = 3, + [I_XTX] = 0, + [I_XTY] = 0, + [I_XTZ] = 0, + [I_ALPHA] = 0, + 
[I_ALPHA_DX] = 1, + [I_ALPHA_DY] = 2, + [I_ALPHA_DZ] = 3, + [I_TRK] = 0, + [I_TRK_DX] = 1, + [I_TRK_DZ] = 3, +}; + +/* interpolate the cactus gridfunctions onto the pseudospectral grid */ +static int interp_geometry(MDSolver *ctx) +{ + MDSolverPriv *s = ctx->priv; + int ret; + + for (int i = 0; i < ctx->nb_equations; i++) { + MDEquationContext *eq_ctx = &s->eqs[i]; + + ret = CCTK_InterpGridArrays(s->gh, 3, s->interp_operator, s->interp_params, + s->coord_system, NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL, + (const void * const *)eq_ctx->interp_coords, ARRAY_ELEMS(s->interp_vars_indices), s->interp_vars_indices, + ARRAY_ELEMS(eq_ctx->interp_values), s->interp_value_codes, (void * const *)eq_ctx->interp_values); + if (ret < 0) + CCTK_WARN(0, "Error interpolating"); + } + + return 0; +} + +#if 0 +#define EQUATION 0 +#include "md_solve_template.c" +#undef EQUATION + +#define EQUATION 1 +#include "md_solve_template.c" +#undef EQUATION +#else +#define EQUATION 0 +#include "gamma_freeze_template.c" +#undef EQUATION + +#define EQUATION 1 +#include "gamma_freeze_template.c" +#undef EQUATION +#endif + +static void (*calc_eq_coeffs[2])(void *, unsigned int, unsigned int, + unsigned int, unsigned int) = { + calc_eq_coeffs_0, + calc_eq_coeffs_1, +}; + +int md_solver_solve(MDSolver *ctx) +{ + MDSolverPriv *s = ctx->priv; + const double *(*eq_coeffs[2])[PSSOLVE_DIFF_ORDER_NB]; + int ret; + int64_t start, totaltime_start; + + totaltime_start = gettime(); + + /* interpolate the metric values and construct the quantities we'll need */ + CCTK_TimerStart("MinimalDistortion_interp_geometry"); + start = gettime(); + + ret = interp_geometry(ctx); + + s->interp_geometry_time += gettime() - start; + s->interp_geometry_count++; + CCTK_TimerStop("MinimalDistortion_interp_geometry"); + if (ret < 0) + return ret; + + CCTK_TimerStart("MinimalDistortion_calc_eq_coeffs"); + start = gettime(); + + for (int i = 0; i < ctx->nb_equations; i++) { + MDCalcEqThread thread = { + .ctx = ctx, + .eq_ctx = 
&s->eqs[i], + .block_size = 256, + }; + + md_threadpool_execute(s->tp, (NB_COLLOC_POINTS(ctx) + thread.block_size - 1) / thread.block_size, + calc_eq_coeffs[i], &thread); + } + + eq_coeffs[0] = s->eqs[0].eq_coeffs; + eq_coeffs[1] = s->eqs[1].eq_coeffs; + + s->calc_eq_coeffs_time += gettime() - start; + s->calc_eq_coeffs_count++; + CCTK_TimerStop("MinimalDistortion_calc_eq_coeffs"); + if (ret < 0) + return ret; + + ret = md_pssolve_solve(s->ps_ctx, + eq_coeffs, + s->rhs, ctx->coeffs); + if (ret < 0) + return ret; + + //for (int i = 0; i < ctx->nb_equations * NB_COEFFS(ctx); i++) + // ctx->coeffs[i] *= s->coeff_scale[i]; + + s->solve_count++; + s->solve_time += gettime() - totaltime_start; + + return 0; +} + +void md_solver_print_stats(MDSolver *ctx) +{ + MDSolverPriv *s = ctx->priv; + + fprintf(stderr, + "%g%% interpolate geometry: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->interp_geometry_time * 100 / s->solve_time, + s->interp_geometry_count, (double)s->interp_geometry_time / 1e6, + (double)s->interp_geometry_time / s->interp_geometry_count / 1e3); + fprintf(stderr, + "%g%% calc equation coefficients: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->calc_eq_coeffs_time * 100 / s->solve_time, + s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6, + (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3); + fprintf(stderr, + "%g%% pseudospectral matrix construction: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time, + s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6, + (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3); + fprintf(stderr, + "%g%% BiCGSTAB %lu solves, " + "%lu iterations, total time %g s, " + "avg iterations per solve %g, avg time per solve %g ms, " + "avg time per iteration %g ms\n", + (double)s->ps_ctx->cg_time_total * 100 / s->solve_time, + 
s->ps_ctx->cg_solve_count, s->ps_ctx->cg_iter_count, (double)s->ps_ctx->cg_time_total / 1e6, + (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count, + (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3, + (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3); + fprintf(stderr, + "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n", + (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time, + s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6, + (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3); +} + +static void init_opencl(MDSolver *ctx) +#if HAVE_OPENCL +{ + MDSolverPriv *s = ctx->priv; + int err, count; + cl_platform_id platform; + cl_context_properties props[3]; + cl_device_id ocl_device; + + err = clGetPlatformIDs(1, &platform, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL platform ID\n"); + return; + } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL device ID\n"); + return; + } + + props[0] = CL_CONTEXT_PLATFORM; + props[1] = (cl_context_properties)platform; + props[2] = 0; + + s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err); + if (err != CL_SUCCESS || !s->ocl_ctx) { + fprintf(stderr, "Could not create an OpenCL context\n"); + return; + } + + s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err); + if (err != CL_SUCCESS || !s->ocl_queue) { + fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err); + goto fail; + } + + err = clblasSetup(); + if (err != CL_SUCCESS) { + fprintf(stderr, "Error setting up clBLAS\n"); + goto fail; + } + + return; +fail: + if (s->ocl_queue) + clReleaseCommandQueue(s->ocl_queue); + s->ocl_queue = 0; + + if (s->ocl_ctx) + clReleaseContext(s->ocl_ctx); + s->ocl_ctx = 0; +} +#else +{ +} +#endif + +static int eq_init(MDSolver *ctx, unsigned 
int eq_idx) +{ + MDSolverPriv *s = ctx->priv; + MDEquationContext *eq_ctx = &s->eqs[eq_idx]; + double *colloc_grid[2] = { s->ps_ctx->colloc_grid[eq_idx][0], + s->ps_ctx->colloc_grid[eq_idx][1] }; + int ret; + + /* prepare the state for the cactus interpolator */ + for (int i = 0; i < ARRAY_ELEMS(eq_ctx->interp_coords); i++) { + ret = posix_memalign((void**)&eq_ctx->interp_coords[i], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*eq_ctx->interp_coords[i])); + if (ret) + return -ENOMEM; + } + + for (int j = 0; j < ctx->nb_colloc_points[1]; j++) { + for (int i = 0; i < ctx->nb_colloc_points[0]; i++) { + eq_ctx->interp_coords[0][j * ctx->nb_colloc_points[0] + i] = colloc_grid[0][i]; + eq_ctx->interp_coords[1][j * ctx->nb_colloc_points[0] + i] = 0; + eq_ctx->interp_coords[2][j * ctx->nb_colloc_points[0] + i] = colloc_grid[1][j]; + } + } + + for (int i = 0; i < ARRAY_ELEMS(eq_ctx->interp_values); i++) { + ret = posix_memalign((void**)&eq_ctx->interp_values[i], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*eq_ctx->interp_values[i])); + if (ret) + return -ENOMEM; + } + + /* allocate the equation coefficients */ + eq_ctx->eq_coeffs = calloc(ctx->nb_equations, sizeof(*eq_ctx->eq_coeffs)); + if (!eq_ctx->eq_coeffs) + return -ENOMEM; + for (int i = 0; i < ctx->nb_equations; i++) + for (int j = 0; j < ARRAY_ELEMS(eq_ctx->eq_coeffs[i]); j++) { + ret = posix_memalign((void**)&eq_ctx->eq_coeffs[i][j], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*eq_ctx->eq_coeffs[i][j])); + if (ret) + return -ENOMEM; + } + + /* setup the RHS pointer */ + if (eq_idx == 0) + eq_ctx->rhs = s->rhs; + else + eq_ctx->rhs = s->eqs[eq_idx - 1].rhs + NB_COLLOC_POINTS(ctx); + + return 0; +} + +static const enum MDBasisFamily basis_sets[2][2] = { + { MD_BASIS_FAMILY_SB_ODD, MD_BASIS_FAMILY_SB_EVEN }, + { MD_BASIS_FAMILY_SB_EVEN, MD_BASIS_FAMILY_SB_ODD }, +}; + +int md_solver_init(MDSolver **pctx, + cGH *cctkGH, ThreadPoolContext *tp, + unsigned int nb_equations, + unsigned int (*basis_order)[2], + double sf, double filter_power, 
double input_filter_power) +{ + MDSolver *ctx; + MDSolverPriv *s; + int max_order = 0; + int ret; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + s = ctx->priv; + + s->gh = cctkGH; + + if (tp) { + s->tp = tp; + } else { + ret = md_threadpool_init(&s->tp_internal, 1); + if (ret < 0) + goto fail; + s->tp = s->tp_internal; + } + + s->eqs = calloc(nb_equations, sizeof(*s->eqs)); + if (!s->eqs) + goto fail; + ctx->nb_equations = nb_equations; + + ctx->nb_coeffs[0] = basis_order[0][0]; + ctx->nb_coeffs[1] = basis_order[0][1]; + + ctx->nb_colloc_points[0] = basis_order[0][0]; + ctx->nb_colloc_points[1] = basis_order[0][1]; + + if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx)) + CCTK_WARN(0, "Non-square collocation matrix"); + + s->colloc_grid_order[0] = ctx->nb_colloc_points[0]; + s->colloc_grid_order[1] = ctx->nb_colloc_points[1]; + + ret = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * nb_equations * NB_COEFFS(ctx)); + ret |= posix_memalign((void**)&s->rhs, 32, sizeof(*s->rhs) * nb_equations * NB_COLLOC_POINTS(ctx)); + if (ret) + goto fail; + + for (int i = 0; i < ctx->nb_equations; i++) + for (int j = 0; j < 2; j++) { + double sf; + + ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], 1.0); + if (ret < 0) + goto fail; + + sf = 64.0 / md_basis_colloc_point(ctx->basis[i][j], s->colloc_grid_order[j], + ctx->nb_colloc_points[j] - 1); + md_basis_free(&ctx->basis[i][j]); + + ret = md_basis_init(&ctx->basis[i][j], basis_sets[i][j], sf); + if (ret < 0) + goto fail; + } + + init_opencl(ctx); + + ret = md_pssolve_context_alloc(&s->ps_ctx, 2); + if (ret < 0) + CCTK_WARN(0, "Error allocating the pseudospectral solver"); + + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) { + s->ps_ctx->basis[i][j] = ctx->basis[i][j]; + s->ps_ctx->solve_order[i][j] = basis_order[i][j]; + max_order = MAX(max_order, basis_order[i][j]); + } + + s->ps_ctx->tp = s->tp; + +#if 
HAVE_OPENCL + s->ps_ctx->ocl_ctx = s->ocl_ctx; + s->ps_ctx->ocl_queue = s->ocl_queue; +#endif + + ret = md_pssolve_context_init(s->ps_ctx); + if (ret < 0) + CCTK_WARN(0, "Error initializing the pseudospectral solver"); + + for (int i = 0; i < max_order; i++) { + fprintf(stderr, "%d ", i); + for (int j = 0; j < 2; j++) + for (int k = 0; k < 2; k++) { + if (i < s->ps_ctx->solve_order[j][k]) + fprintf(stderr, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]); + else + fprintf(stderr, " "); + } + fprintf(stderr, "\n"); + } + + /* init the per-equation state */ + for (int i = 0; i < ctx->nb_equations; i++) { + ret = eq_init(ctx, i); + if (ret < 0) + goto fail; + } + + ret = posix_memalign((void**)&s->coeff_scale, 32, 2 * NB_COEFFS(ctx) * sizeof(*s->coeff_scale)); + if (ret) + goto fail; + for (int j = 0; j < ctx->nb_coeffs[1]; j++) + for (int i = 0; i < ctx->nb_coeffs[0]; i++) { + s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) * + exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power)); + s->coeff_scale[NB_COEFFS(ctx) + j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) * + exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power)); + } + + for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) { +#if 0 + ret = posix_memalign((void**)&s->interp_values[i], 32, + 2 * NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i])); + if (ret) + goto fail; +#endif + s->interp_value_codes[i] = CCTK_VARIABLE_REAL; + } + + for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) { + s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]); + if (s->interp_vars_indices[i] < 0) + CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]); + } + + s->coord_system = CCTK_CoordSystemHandle("cart3d"); + if (s->coord_system < 0) + CCTK_WARN(0, "Error getting the coordinate system"); + + s->interp_operator = CCTK_InterpHandle("Lagrange 
polynomial interpolation (tensor product)"); + if (s->interp_operator < 0) + CCTK_WARN(0, "Error getting the interpolation operator"); + + s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1"); + if (s->interp_params < 0) + CCTK_WARN(0, "Error creating interpolation parameters table"); + + ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS, + interp_operation_codes, "operation_codes"); + if (ret < 0) + CCTK_WARN(0, "Error setting operation codes"); + + ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS, + interp_operation_indices, "operand_indices"); + if (ret < 0) + CCTK_WARN(0, "Error setting operand indices"); + + CCTK_TimerCreate("MinimalDistortion_Solve"); + CCTK_TimerCreate("MinimalDistortion_Expand"); + CCTK_TimerCreate("MinimalDistortion_interp_geometry"); + CCTK_TimerCreate("MinimalDistortion_calc_eq_coeffs"); + CCTK_TimerCreate("MinimalDistortion_construct_matrix"); + CCTK_TimerCreate("MinimalDistortion_solve_LU"); + CCTK_TimerCreate("MinimalDistortion_solve_BiCGSTAB"); + + *pctx = ctx; + return 0; +fail: + md_solver_free(&ctx); + return -ENOMEM; +} + +void md_solver_free(MDSolver **pctx) +{ + MDSolver *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_coords); i++) + free(ctx->priv->interp_coords[i]); + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_values); i++) + free(ctx->priv->interp_values[i]); + free(ctx->priv->rhs); + free(ctx->priv->coeff_scale); + + for (int i = 0; i < ctx->nb_equations; i++) { + MDEquationContext *eq_ctx = &ctx->priv->eqs[i]; + for (int j = 0; j < ARRAY_ELEMS(eq_ctx->interp_coords); j++) + free(eq_ctx->interp_coords[j]); + for (int j = 0; j < ARRAY_ELEMS(eq_ctx->interp_values); j++) + free(eq_ctx->interp_values[j]); + + if (eq_ctx->eq_coeffs) { + for (int j = 0; j < ctx->nb_equations; j++) + for (int k = 0; k < ARRAY_ELEMS(eq_ctx->eq_coeffs[j]); k++) + free(eq_ctx->eq_coeffs[j][k]); + } + free(eq_ctx->eq_coeffs); + } + 
free(ctx->priv->eqs); + + md_pssolve_context_free(&ctx->priv->ps_ctx); + + md_threadpool_free(&ctx->priv->tp_internal); + +#if HAVE_OPENCL + if (ctx->priv->ocl_queue) + clReleaseCommandQueue(ctx->priv->ocl_queue); + if (ctx->priv->ocl_ctx) + clReleaseContext(ctx->priv->ocl_ctx); +#endif + } + + free(ctx->priv); + + free(ctx->coeffs); + + free(ctx); + *pctx = NULL; +} diff --git a/src/md_solve.h b/src/md_solve.h new file mode 100644 index 0000000..07d313a --- /dev/null +++ b/src/md_solve.h @@ -0,0 +1,58 @@ +/* + * Quasimaximal slicing -- actual solver code + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef MD_SOLVE_H +#define MD_SOLVE_H + +#include "common.h" + +#include "cctk.h" + +#include "basis.h" +#include "threadpool.h" + +typedef struct MDSolverPriv MDSolverPriv; + +typedef struct MDSolver { + MDSolverPriv *priv; + + unsigned int nb_equations; + + MDBasisSetContext *basis[2][2]; + + int nb_coeffs[2]; + int nb_colloc_points[2]; + + double *coeffs; + + ThreadPoolContext *tp; +} MDSolver; + +int md_solver_init(MDSolver **ctx, + cGH *cctkGH, ThreadPoolContext *tp, + unsigned int nb_equations, + unsigned int (*basis_order)[2], + double sf, double filter_power, double input_filter_power); + +void md_solver_free(MDSolver **ctx); + +int md_solver_solve(MDSolver *ctx); + +void md_solver_print_stats(MDSolver *ctx); + +#endif /* MD_SOLVE_H */ diff --git a/src/md_solve_template.c b/src/md_solve_template.c new file mode 100644 index 0000000..260405e --- /dev/null +++ b/src/md_solve_template.c @@ -0,0 +1,577 @@ +/* + * Minimal distortion -- template for the equations definitions + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#define FUNC3(a, b) a ## _ ## b +#define FUNC2(a, b) FUNC3(a, b) +#define FUNC(name) FUNC2(name, EQUATION) + +/** + * A template for calculating the equation coefficients. 
+ */ +static void FUNC(calc_eq_coeffs)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + const MDCalcEqThread *et = arg; + const MDSolver *ctx = et->ctx; + MDEquationContext *eq_ctx = et->eq_ctx; + + const int start = job_idx * et->block_size; + const int end = MIN((job_idx + 1) * et->block_size, NB_COLLOC_POINTS(ctx)); + + for (int i = start; i < end; i++) { + const double x = eq_ctx->interp_coords[0][i]; + const double z = eq_ctx->interp_coords[2][i]; + const int zaxis = x <= EPS; + + double c1o3 = (1.0 / 3.0); + + double gtu[3][3], g[3][3], gu[3][3]; + double dg[3][3][3], d2g[3][3][3][3], dgu[3][3][3], G[3][3][3], dG[3][3][3][3]; + double A[3][3], Au[3][3]; + double dA[3][3][3], dAu[3][3][3]; + double Ric[3][3], Ricm[3][3]; + double rhs_x, rhs_z; + + const double gtxx = eq_ctx->interp_values[I_GTXX][i]; + const double gtyy = eq_ctx->interp_values[I_GTYY][i]; + const double gtzz = eq_ctx->interp_values[I_GTZZ][i]; + const double gtxy = eq_ctx->interp_values[I_GTXY][i]; + const double gtxz = eq_ctx->interp_values[I_GTXZ][i]; + const double gtyz = eq_ctx->interp_values[I_GTYZ][i]; + + const double gt[3][3] = {{ gtxx, gtxy, gtxz }, + { gtxy, gtyy, gtyz }, + { gtxz, gtyz, gtzz }}; + + const double dx_gt11 = eq_ctx->interp_values[I_GTXX_DX][i]; + const double dx_gt22 = eq_ctx->interp_values[I_GTYY_DX][i]; + const double dx_gt33 = eq_ctx->interp_values[I_GTZZ_DX][i]; + const double dx_gt13 = eq_ctx->interp_values[I_GTXZ_DX][i]; + + const double dz_gt11 = eq_ctx->interp_values[I_GTXX_DZ][i]; + const double dz_gt22 = eq_ctx->interp_values[I_GTYY_DZ][i]; + const double dz_gt33 = eq_ctx->interp_values[I_GTZZ_DZ][i]; + const double dz_gt13 = eq_ctx->interp_values[I_GTXZ_DZ][i]; + + const double dgt[3][3][3] = { + { + { dx_gt11, 0.0, dx_gt13 }, + { 0.0, dx_gt22, 0.0 }, + { dx_gt13, 0.0, dx_gt33 }, + }, + { + { 0.0, zaxis ? dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0 }, + { zaxis ? 
dx_gt11 - dx_gt22 : (gtxx - gtyy) / x, 0.0, zaxis ? dx_gt13 : gtxz / x }, + { 0.0, zaxis ? dx_gt13 : gtxz / x, 0.0 }, + }, + { + { dz_gt11, 0.0, dz_gt13 }, + { 0.0, dz_gt22, 0.0 }, + { dz_gt13, 0.0, dz_gt33 }, + }, + }; + + const double dxx_gt11 = eq_ctx->interp_values[I_GTXX_DXX][i]; + const double dxx_gt22 = eq_ctx->interp_values[I_GTYY_DXX][i]; + const double dxx_gt33 = eq_ctx->interp_values[I_GTZZ_DXX][i]; + const double dxx_gt13 = eq_ctx->interp_values[I_GTXZ_DXX][i]; + + const double dxz_gt11 = eq_ctx->interp_values[I_GTXX_DXZ][i]; + const double dxz_gt22 = eq_ctx->interp_values[I_GTYY_DXZ][i]; + const double dxz_gt33 = eq_ctx->interp_values[I_GTZZ_DXZ][i]; + const double dxz_gt13 = eq_ctx->interp_values[I_GTXZ_DXZ][i]; + + const double dzz_gt11 = eq_ctx->interp_values[I_GTXX_DZZ][i]; + const double dzz_gt22 = eq_ctx->interp_values[I_GTYY_DZZ][i]; + const double dzz_gt33 = eq_ctx->interp_values[I_GTZZ_DZZ][i]; + const double dzz_gt13 = eq_ctx->interp_values[I_GTXZ_DZZ][i]; + + const double d2gt[3][3][3][3] = { + { + { + { dxx_gt11, 0.0, dxx_gt13 }, + { 0.0, dxx_gt22, 0.0 }, + { dxx_gt13, 0.0, dxx_gt33 }, + }, + { + { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 }, + }, + { + { dxz_gt11, 0.0, dxz_gt13 }, + { 0.0, dxz_gt22, 0.0 }, + { dxz_gt13, 0.0, dxz_gt33 }, + }, + + }, + { + { + { 0.0, zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * (dxx_gt11 - dxx_gt22) : (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0 }, + }, + { + { zaxis ? 
dxx_gt22 : dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x), 0.0, + zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x) }, + { 0.0, zaxis ? dxx_gt11 : dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x), 0.0 }, + { zaxis ? 0.5 * dxx_gt13 : dx_gt13 / x - gtxz / SQR(x), 0.0, zaxis ? dxx_gt33 : dx_gt33 / x }, + }, + { + { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 }, + { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0, + zaxis ? dxz_gt13 : dz_gt13 / x }, + { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 }, + }, + + }, + { + { + { dxz_gt11, 0.0, dxz_gt13 }, + { 0.0, dxz_gt22, 0.0 }, + { dxz_gt13, 0.0, dxz_gt33 }, + }, + { + { 0.0, zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0 }, + { zaxis ? dxz_gt11 - dxz_gt22 : (dz_gt11 - dz_gt22) / x, 0.0, + zaxis ? dxz_gt13 : dz_gt13 / x }, + { 0.0, zaxis ? dxz_gt13 : dz_gt13 / x, 0.0 }, + }, + { + { dzz_gt11, 0.0, dzz_gt13 }, + { 0.0, dzz_gt22, 0.0 }, + { dzz_gt13, 0.0, dzz_gt33 }, + }, + + }, + }; + + const double Atxx = eq_ctx->interp_values[I_ATXX][i]; + const double Atyy = eq_ctx->interp_values[I_ATYY][i]; + const double Atzz = eq_ctx->interp_values[I_ATZZ][i]; + const double Atxy = eq_ctx->interp_values[I_ATXY][i]; + const double Atxz = eq_ctx->interp_values[I_ATXZ][i]; + const double Atyz = eq_ctx->interp_values[I_ATYZ][i]; + + const double dx_At11 = eq_ctx->interp_values[I_ATXX_DX][i]; + const double dx_At22 = eq_ctx->interp_values[I_ATYY_DX][i]; + const double dx_At33 = eq_ctx->interp_values[I_ATZZ_DX][i]; + const double dx_At13 = eq_ctx->interp_values[I_ATXZ_DX][i]; + + const double dz_At11 = eq_ctx->interp_values[I_ATXX_DZ][i]; + const double dz_At22 = eq_ctx->interp_values[I_ATYY_DZ][i]; + const double dz_At33 = eq_ctx->interp_values[I_ATZZ_DZ][i]; + const double dz_At13 = eq_ctx->interp_values[I_ATXZ_DZ][i]; + + const double dAt[3][3][3] = { + { + { dx_At11, 0.0, dx_At13 }, + { 0.0, dx_At22, 0.0 }, + { dx_At13, 0.0, dx_At33 }, + }, + { + { 0.0, zaxis ? 
dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0 }, + { zaxis ? dx_At11 - dx_At22 : (Atxx - Atyy) / x, 0.0, zaxis ? dx_At13 : Atxz / x }, + { 0.0, zaxis ? dx_At13 : Atxz / x, 0.0 }, + }, + { + { dz_At11, 0.0, dz_At13 }, + { 0.0, dz_At22, 0.0 }, + { dz_At13, 0.0, dz_At33 }, + }, + }; + + const double phi = eq_ctx->interp_values[I_PHI][i]; + + const double phi_dx = eq_ctx->interp_values[I_PHI_DX][i]; + const double phi_dz = eq_ctx->interp_values[I_PHI_DZ][i]; + + const double dphi[3] = { phi_dx, 0.0, phi_dz }; + + const double phi_dxx = eq_ctx->interp_values[I_PHI_DXX][i]; + const double phi_dzz = eq_ctx->interp_values[I_PHI_DZZ][i]; + const double phi_dxz = eq_ctx->interp_values[I_PHI_DXZ][i]; + + const double d2phi[3][3] = { + { phi_dxx, 0.0, phi_dxz }, + { 0.0, zaxis ? phi_dxx : phi_dx / x, 0.0 }, + { phi_dxz, 0.0, phi_dzz }, + }; + + const double At[3][3] = {{ Atxx, Atxy, Atxz }, + { Atxy, Atyy, Atyz }, + { Atxz, Atyz, Atzz }}; + + const double alpha = eq_ctx->interp_values[I_ALPHA][i]; + const double dx_alpha = eq_ctx->interp_values[I_ALPHA_DX][i]; + const double dz_alpha = eq_ctx->interp_values[I_ALPHA_DZ][i]; + + const double dalpha[3] = { dx_alpha, 0.0, dz_alpha }; + + const double Xtx = eq_ctx->interp_values[I_XTX][i]; + const double Xtz = eq_ctx->interp_values[I_XTZ][i]; + + const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); + + // \tilde{γ}^{ij} + gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; + gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; + gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; + gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; + gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; + gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; + gtu[1][0] = gtu[0][1]; + gtu[2][0] = gtu[0][2]; + gtu[2][1] = gtu[1][2]; + + // γ_{jk}/^{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + gu[j][k] = SQR(phi) * gtu[j][k]; + g[j][k] = gt[j][k] / SQR(phi); + } + + // ∂_j γ_{kl} + for (int j = 0; j < 
3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi); + dA[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi); + } + + // ∂_j γ^{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + for (int n = 0; n < 3; n++) + val += -gu[k][m] * gu[l][n] * dg[j][m][n]; + dgu[j][k][l] = val; + } + + // ∂_{jk} g_{lm} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) { + d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) - + 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) - + 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) - + 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) + + d2gt[j][k][l][m] / SQR(phi); + } + + // Γ^j_{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + val += 0.5 * gu[j][m] * (dg[k][l][m] + dg[l][k][m] - dg[m][k][l]); + G[j][k][l] = val; + } + + // ∂_j Γ^k_{lm} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) { + double val = 0.0; + for (int n = 0; n < 3; n++) { + val += dgu[j][k][n] * (dg [l][m][n] + dg [m][l][n] - dg [n][l][m]) + + gu [k][n] * (d2g[j][l][m][n] + d2g[j][m][l][n] - d2g[j][n][l][m]); + } + dG[j][k][l][m] = 0.5 * val; + } + + // Ric_{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + val += dG[m][m][j][k] - dG[k][m][j][m]; + for (int m = 0; m < 3; m++) + for (int l = 0; l < 3; l++) + val += G[l][l][m] * G[m][j][k] - G[l][k][m] * G[m][j][l]; + Ric[j][k] = val; + } + + // Ric^j_k + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += 
gu[j][l] * Ric[l][k]; + Ricm[j][k] = val; + } + + // A_{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + A[j][k] = At[j][k] / SQR(phi); + } + + // d_j A^{kl} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int m = 0; m < 3; m++) + for (int n = 0; n < 3; n++) + val += dgu[j][k][m] * gu[l][n] * A[m][n] + gu[k][m] * dgu[j][l][n] * A[m][n] + gu[k][m] * gu[l][n] * dA[j][m][n]; + dAu[j][k][l] = val; + } + + // A^{jk} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + for (int m = 0; m < 3; m++) + val += gu[j][l] * gu[k][m] * A[l][m]; + Au[j][k] = val; + } + + rhs_x = 0.0; + rhs_z = 0.0; + for (int j = 0; j < 3; j++) { + rhs_x += dalpha[j] * Au[0][j]; + rhs_z += dalpha[j] * Au[2][j]; + } + for (int j = 0; j < 3; j++) { + rhs_x += alpha * dAu[j][0][j]; + rhs_z += alpha * dAu[j][2][j]; + } + for (int j = 0; j < 3; j++) { + double val_x = 0.0; + double val_z = 0.0; + for (int k = 0; k < 3; k++) { + val_x += G[0][j][k] * Au[k][j]; + val_z += G[2][j][k] * Au[k][j]; + } + rhs_x += val_x * alpha; + rhs_z += val_z * alpha; + } + for (int j = 0; j < 3; j++) { + double val_x = 0.0; + double val_z = 0.0; + for (int k = 0; k < 3; k++) { + val_x += G[j][j][k] * Au[0][k]; + val_z += G[j][j][k] * Au[2][k]; + } + rhs_x += val_x * alpha; + rhs_z += val_z * alpha; + } + + rhs_x *= 2.0; + rhs_z *= 2.0; + + double X[3] = { 0.0 }; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + X[0] += gu[j][k] * G[0][j][k]; + X[2] += gu[j][k] * G[2][j][k]; + } + + if (EQUATION == 0) { + /* eq 0 */ + /* ∂_{xx}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + c1o3 * gu[0][0] + (zaxis ? 
0.5 * (gu[1][1] + c1o3 * gu[0][0]) : 0.0); + /* ∂_{xx}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = 0.0; + /* ∂_{zz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2]; + /* ∂_{zz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = c1o3 * gu[0][2]; + + /* ∂_{xz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2] + (zaxis ? c1o3 * gu[0][2] : 0.0); + /* ∂_{xz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[0][0]; + + /* ∂_{x}β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[0][j] * G[0][j][0]; + t1 += G[j][j][0]; + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[0][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[0][1][1] : (gu[1][1] + c1o3 * gu[0][0]) / x); + } + /* ∂_{x}β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[0][j] * G[0][j][2]; + t1 += G[j][j][2]; + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[0][0] * t1; + } + + /* ∂_{z}β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[2][j] * G[0][j][0]; + t1 += G[j][j][0]; + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[0][2] * t1 + (zaxis ? 0.0 : c1o3 * gu[0][2] / x); + } + /* ∂_{z}β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[2][j] * G[0][j][2]; + t1 += G[j][j][2]; + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[0][2] * t1; + } + + /* β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int j = 0; j < 3; j++) + val += G[0][k][j] * G[j][l][0] - G[j][k][l] * G[0][0][j]; + t0 += gu[k][l] * (dG[k][0][l][0] + val); + t1 += gu[0][k] * dG[k][l][l][0]; + } + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][0] + (zaxis ? 
0.0 : 2.0 * gu[1][1] * G[0][1][1] / x - (gu[1][1] + c1o3 * gu[0][0]) / SQR(x)); + } + + /* β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int j = 0; j < 3; j++) + val += G[0][k][j] * G[j][l][2] - G[j][k][l] * G[0][2][j]; + t0 += gu[k][l] * (dG[k][0][l][2] + val); + t1 += gu[0][k] * dG[k][l][l][2]; + } + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[0][2]; + } + + eq_ctx->rhs[i] = rhs_x; + } else { + /* eq 1 */ + /* ∂_{xx}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_20][i] = c1o3 * gu[2][0] + (zaxis ? c1o3 * 0.5 * gu[2][0] : 0.0); + /* ∂_{xx}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_20][i] = gu[0][0] + (zaxis ? gu[1][1] : 0.0); + /* ∂_{zz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_02][i] = 0.0; + /* ∂_{zz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_02][i] = gu[2][2] + c1o3 * gu[2][2]; + /* ∂_{xz}β^x */ + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_11][i] = c1o3 * gu[2][2] + (zaxis ? c1o3 * gu[2][2] : 0.0); + /* ∂_{xz}β^z */ + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_11][i] = 2.0 * gu[0][2] + c1o3 * gu[0][2]; + + /* ∂_{x}β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[0][j] * G[2][j][0]; + t1 += G[j][j][0]; + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 + c1o3 * gu[2][0] * t1 + (zaxis ? 2.0 * gu[1][1] * G[2][1][1] : c1o3 * gu[2][0] / x); + } + /* ∂_{x}β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[0][j] * G[2][j][2]; + t1 += G[j][j][2]; + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_10][i] = 2.0 * t0 - X[0] + c1o3 * gu[2][0] * t1 + (zaxis ? 0.0 : gu[1][1] / x); + } + /* ∂_{z}β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[2][j] * G[2][j][0]; + t1 += G[j][j][0]; + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 + c1o3 * gu[2][2] * t1 + (zaxis ? 
0.0 : c1o3 * gu[2][2] / x); + } + /* ∂_{z}β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int j = 0; j < 3; j++) { + t0 += gu[2][j] * G[2][j][2]; + t1 += G[j][j][2]; + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_01][i] = 2.0 * t0 - X[2] + c1o3 * gu[2][2] * t1; + } + + /* β^x */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int j = 0; j < 3; j++) + val += G[2][k][j] * G[j][l][0] - G[j][k][l] * G[2][0][j]; + t0 += gu[k][l] * (dG[k][2][l][0] + val); + t1 += gu[2][k] * dG[k][l][l][0]; + } + } + eq_ctx->eq_coeffs[0][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][0] + (zaxis ? 0.0 : 2.0 * gu[1][1] * G[2][1][1] / x - c1o3 * gu[2][0] / SQR(x)); + } + + /* β^z */ + { + double t0 = 0.0; + double t1 = 0.0; + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + double val = 0.0; + for (int j = 0; j < 3; j++) + val += G[2][k][j] * G[j][l][2] - G[j][k][l] * G[2][2][j]; + t0 += gu[k][l] * (dG[k][2][l][2] + val); + t1 += gu[2][k] * dG[k][l][l][2]; + } + } + eq_ctx->eq_coeffs[1][PSSOLVE_DIFF_ORDER_00][i] = t0 + c1o3 * t1 + Ricm[2][2]; + } + + eq_ctx->rhs[i] = rhs_z; + } + } +} diff --git a/src/pssolve.c b/src/pssolve.c new file mode 100644 index 0000000..1f5bb44 --- /dev/null +++ b/src/pssolve.c @@ -0,0 +1,498 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <cblas.h> +#include <lapacke.h> + +#include "bicgstab.h" +#include "pssolve.h" +#include "threadpool.h" + +#define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1]) +#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1]) + +typedef struct PSEquationContext { + size_t nb_coeffs[2]; + size_t nb_colloc_points[2]; + size_t colloc_grid_order[2]; + + double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB]; + double *mat; +} PSEquationContext; + +struct PSSolvePriv { + BiCGStabContext *bicgstab; + int steps_since_inverse; + + size_t nb_coeffs; + + PSEquationContext *eqs; + + int *ipiv; + double *mat; + + ThreadPoolContext *tp; + ThreadPoolContext *tp_internal; +}; + +typedef struct ConstructMatrixThread { + const PSEquationContext *eq_ctx; + const double **eq_coeffs; + double *mat; + ptrdiff_t mat_stride; + unsigned int var_idx; +} ConstructMatrixThread; + +static void construct_matrix(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + ConstructMatrixThread *cmt = arg; + const PSEquationContext *eq_ctx = cmt->eq_ctx; + const double **eq_coeffs = cmt->eq_coeffs; + double *mat = cmt->mat; + ptrdiff_t mat_stride = cmt->mat_stride; + unsigned int var_idx = cmt->var_idx; + unsigned int idx_coeff = job_idx; + + for (int idx_grid = 0; idx_grid < NB_COLLOC_POINTS(eq_ctx); idx_grid++) { + const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + double val = 0.0; + + for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++) + val += eq_coeffs[i][idx_grid] * eq_ctx->basis_val[var_idx][i][idx]; + + mat[idx_grid + mat_stride * idx_coeff] = val; + } +} + +static int 
lu_invert(const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; + +#if 0 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +int md_pssolve_solve(PSSolveContext *ctx, + const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs) +{ + PSSolvePriv *s = ctx->priv; + double rhs_max; + int64_t start; + + int ret = 0; + 
+ /* fill the matrix */ + start = gettime(); + + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + double *mat = s->eqs[i].mat; + + for (int j = 0; j < ctx->nb_equations; j++) { + ConstructMatrixThread thread = { + .eq_ctx = eq_ctx, + .eq_coeffs = eq_coeffs[i][j], + .mat = mat, + .mat_stride = s->nb_coeffs, + .var_idx = j, + }; + md_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix, + &thread); + mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs; + } + } + + ctx->construct_matrix_time += gettime() - start; + ctx->construct_matrix_count++; + +#if 0 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (s->steps_since_inverse < 1024) { + int64_t start; + + start = gettime(); + + CCTK_TimerStart("MinimalDistortion_solve_BiCGSTAB"); + ret = md_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); + CCTK_TimerStop("MinimalDistortion_solve_BiCGSTAB"); + + if (ret >= 0) { + ctx->cg_time_total += gettime() - start; + ctx->cg_solve_count++; + ctx->cg_iter_count += ret + 1; + s->steps_since_inverse++; + + } + } else + ret = -1; + + if (ret < 0) { + int64_t start; + + CCTK_TimerStart("MinimalDistortion_solve_LU"); + start = gettime(); + + memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs)); + + ret = lu_invert(s->nb_coeffs, s->mat, coeffs, s->ipiv); + ctx->lu_solves_time += gettime() - start; + ctx->lu_solves_count++; + CCTK_TimerStop("MinimalDistortion_solve_LU"); + + ret = md_bicgstab_init(s->bicgstab, s->mat, coeffs); + + s->steps_since_inverse = 0; + } + + return ret; +} + +static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx) +{ + PSSolvePriv *s = ctx->priv; + PSEquationContext *eq_ctx = &s->eqs[eq_idx]; + int ret; + + eq_ctx->basis_val = 
calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val)); + if (!eq_ctx->basis_val) + return -ENOMEM; + + for (int i = 0; i < ctx->nb_equations; i++) { + double *basis_val[2][3] = { { NULL } }; + + /* for each direction, compute the corresponding basis values/derivatives */ + for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) { + for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) { + ret = posix_memalign((void**)&basis_val[dir][diff_order], 32, + sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) { + double coord = ctx->colloc_grid[eq_idx][dir][k]; + for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) { + basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_VALUE, coord, l); + basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF1, coord, l); + basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = md_basis_eval(ctx->basis[i][dir], MD_BASIS_EVAL_TYPE_DIFF2, coord, l); + } + } + } + + for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) { + ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32, + NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(eq_ctx) * sizeof(*eq_ctx->basis_val[i][diff])); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) { + const double *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1]; + const double *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1]; + const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1]; + + for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) { + const double *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0]; + const double *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0]; + const double *d2basis0 = basis_val[0][2] + k * 
s->eqs[i].nb_coeffs[0]; + + const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k; + + for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++) + for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) { + const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m; + const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] = basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] = dbasis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] = basis0[m] * dbasis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] = basis0[m] * d2basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] = dbasis0[m] * dbasis1[l]; + } + } + } + +fail: + for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) + for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++) + free(basis_val[dir][diff]); + if (ret < 0) + return ret; + } + + return 0; +} + +int md_pssolve_context_init(PSSolveContext *ctx) +{ + PSSolvePriv *s = ctx->priv; + size_t N = 0; + + int ret = 0; + + if (ctx->tp) { + s->tp = ctx->tp; + } else { + ret = md_threadpool_init(&s->tp_internal, 1); + if (ret < 0) + return ret; + s->tp = s->tp_internal; + } + + /* sanity check the parameters */ + for (int i = 0; i < ctx->nb_equations; i++) { + if (!ctx->basis[i][0] || !ctx->basis[i][1]) { + fprintf(stderr, "Basis set for variable %d not set\n", i); + return -EINVAL; + } + if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) { + fprintf(stderr, "Solver order for variable %d not set\n", i); + return -EINVAL; + } + + N += ctx->solve_order[i][0] * ctx->solve_order[i][1]; + } + + ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N); + ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N); + if (ret) + return -ENOMEM; + + s->nb_coeffs = N; + + ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid)); + if (!ctx->colloc_grid) + 
return -ENOMEM; + + /* initialize the per-equation state */ + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + + eq_ctx->nb_coeffs[0] = ctx->solve_order[i][0]; + eq_ctx->nb_coeffs[1] = ctx->solve_order[i][1]; + eq_ctx->nb_colloc_points[0] = ctx->solve_order[i][0]; + eq_ctx->nb_colloc_points[1] = ctx->solve_order[i][1]; + eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0]; + eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1]; + + if (i == 0) + eq_ctx->mat = s->mat; + else + eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]); + + /* compute the collocation grid */ + posix_memalign((void**)&ctx->colloc_grid[i][0], 32, eq_ctx->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[i][0])); + posix_memalign((void**)&ctx->colloc_grid[i][1], 32, eq_ctx->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[i][1])); + if (!ctx->colloc_grid[i][0] || !ctx->colloc_grid[i][1]) + return -ENOMEM; + + for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++) + ctx->colloc_grid[i][0][j] = md_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j); + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) + ctx->colloc_grid[i][1][j] = md_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j); + + } + + /* precompute the basis values we will need */ + for (int i = 0; i < ctx->nb_equations; i++) { + ret = basis_val_init(ctx, i); + if (ret < 0) + return ret; + } + + s->steps_since_inverse = INT_MAX; + + /* init the BiCGStab solver */ + ret = md_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue); + if (ret < 0) + return ret; + + return 0; +} + +int md_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations) +{ + PSSolveContext *ctx; + + if (!nb_equations) + return -EINVAL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->nb_equations = nb_equations; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + + ctx->priv->eqs = 
calloc(nb_equations, sizeof(*ctx->priv->eqs)); + if (!ctx->priv->eqs) + goto fail; + + ctx->basis = calloc(nb_equations, sizeof(*ctx->basis)); + if (!ctx->basis) + goto fail; + + ctx->solve_order = calloc(nb_equations, sizeof(*ctx->solve_order)); + if (!ctx->solve_order) + goto fail; + + *pctx = ctx; + return 0; +fail: + md_pssolve_context_free(&ctx); + return -ENOMEM; +} + +void md_pssolve_context_free(PSSolveContext **pctx) +{ + PSSolveContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + if (ctx->priv->eqs) { + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &ctx->priv->eqs[i]; + + for (int j = 0; j < ctx->nb_equations; j++) + for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++) + free(eq_ctx->basis_val[j][k]); + free(eq_ctx->basis_val); + } + } + + free(ctx->priv->eqs); + + free(ctx->priv->ipiv); + free(ctx->priv->mat); + + md_bicgstab_context_free(&ctx->priv->bicgstab); + md_threadpool_free(&ctx->priv->tp_internal); + } + + free(ctx->priv); + + if (ctx->colloc_grid) { + for (int i = 0; i < ctx->nb_equations; i++) + for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++) + free(ctx->colloc_grid[i][j]); + } + + free(ctx->colloc_grid[0]); + free(ctx->colloc_grid[1]); + + free(ctx->basis); + free(ctx->solve_order); + + free(ctx); + *pctx = NULL; +} diff --git a/src/pssolve.h b/src/pssolve.h new file mode 100644 index 0000000..e6a4c1a --- /dev/null +++ b/src/pssolve.h @@ -0,0 +1,139 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef MD_PSSOLVE_H +#define MD_PSSOLVE_H + +#include "common.h" + +#if HAVE_OPENCL +#include <cl.h> +#else +typedef void* cl_context; +typedef void* cl_command_queue; +#endif + +#include <stdint.h> + +#include "basis.h" +#include "threadpool.h" + +enum PSSolveDiffOrder { + PSSOLVE_DIFF_ORDER_00, + PSSOLVE_DIFF_ORDER_10, + PSSOLVE_DIFF_ORDER_01, + PSSOLVE_DIFF_ORDER_11, + PSSOLVE_DIFF_ORDER_20, + PSSOLVE_DIFF_ORDER_02, + PSSOLVE_DIFF_ORDER_NB, +}; + +typedef struct PSSolvePriv PSSolvePriv; + +typedef struct PSSolveContext { + /** + * Solver private data, not to be touched by the caller. + */ + PSSolvePriv *priv; + + /** + * Number of equations/unknown functions in the set. + * Set by md_pssolve_context_alloc(). + */ + unsigned int nb_equations; + + /** + * The basis sets. + * + * basis[i][j] is the basis set used for i-th variable in j-th direction. + * + * The array is allocated by md_pssolve_context_alloc(), must be filled by + * by the caller before md_pssolve_context_init(). + */ + const MDBasisSetContext *(*basis)[2]; + + /** + * Order of the solver. + * + * solve_order[i][j] is the order of the solver (i.e. the number of the + * basis functions used) for i-th variable in j-th direction. + * + * Allocated by md_pssolve_context_alloc(), must be filled by the caller + * before md_pssolve_context_init(). + */ + unsigned int (*solve_order)[2]; + + /** + * Locations of the collocation points. The equation coefficients passed to + * md_pssolve_solve() should be evaluated at those grid positions. 
+ * + * colloc_grid[i][j] is an array of length solve_order[i][j] and contains + * the collocation points for the i-th variable in the j-th direction. + * + * Set by the solver after md_pssolve_context_init(). + */ + double *(*colloc_grid)[2]; + + /** + * The thread pool used for multithreaded execution. May be set by the + * caller before md_pssolve_context_init(), otherwise a single thread will + * be used. + */ + ThreadPoolContext *tp; + + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + uint64_t lu_solves_count; + uint64_t lu_solves_time; + + uint64_t cg_solve_count; + uint64_t cg_iter_count; + uint64_t cg_time_total; + + uint64_t construct_matrix_count; + uint64_t construct_matrix_time; +} PSSolveContext; + +/** + * Allocate a new solver. + */ +int md_pssolve_context_alloc(PSSolveContext **ctx, unsigned int nb_equations); + +/** + * Initialize the solver for use after all the context options have been set. + */ +int md_pssolve_context_init(PSSolveContext *ctx); + +/** + * Free the solver and all its internal state. + */ +void md_pssolve_context_free(PSSolveContext **ctx); + +/** + * Solve a second order linear PDE in 2D with a pseudospectral method. + * + * @param eq_coeffs the equation coefficients. + * @param rhs the right-hand side of the equation at the collocation points. + * @param coeffs the spectral coefficients of the solution will be written here. 
+ */ +int md_pssolve_solve(PSSolveContext *ctx, + const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs); + +#endif /* MD_PSSOLVE_H */ diff --git a/src/register.c b/src/register.c new file mode 100644 index 0000000..64b47ce --- /dev/null +++ b/src/register.c @@ -0,0 +1,7 @@ +void minimal_distortion_axi_register_mol(CCTK_ARGUMENTS) +{ + MoLRegisterConstrained(CCTK_VarIndex("ML_BSSN::beta1")); + MoLRegisterConstrained(CCTK_VarIndex("ML_BSSN::beta2")); + MoLRegisterConstrained(CCTK_VarIndex("ML_BSSN::beta3")); +} + diff --git a/src/threadpool.c b/src/threadpool.c new file mode 100644 index 0000000..2febdcb --- /dev/null +++ b/src/threadpool.c @@ -0,0 +1,174 @@ +/* + * Copyright 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <errno.h> +#include <pthread.h> +#include <stdlib.h> + +#include "threadpool.h" + +typedef struct WorkerContext { + ThreadPoolContext *parent; + pthread_t thread; + unsigned int idx; +} WorkerContext; + +struct ThreadPoolContext { + WorkerContext *workers; + unsigned int nb_workers; + + pthread_mutex_t mutex; + pthread_cond_t cond; + void (*func)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads); + void *func_arg; + int next_job; + int nb_jobs; + int nb_jobs_finished; + + int finish; +}; + +void *worker_thread(void *arg) +{ + WorkerContext *w = arg; + ThreadPoolContext *ctx = w->parent; + int nb_jobs, job_idx; + + while (1) { + pthread_mutex_lock(&ctx->mutex); + while (!ctx->finish && ctx->next_job >= ctx->nb_jobs) + pthread_cond_wait(&ctx->cond, &ctx->mutex); + + if (ctx->finish) { + pthread_mutex_unlock(&ctx->mutex); + break; + } + + nb_jobs = ctx->nb_jobs; + job_idx = ctx->next_job++; + + pthread_mutex_unlock(&ctx->mutex); + + ctx->func(ctx->func_arg, job_idx, nb_jobs, w->idx, ctx->nb_workers); + + pthread_mutex_lock(&ctx->mutex); + + ctx->nb_jobs_finished++; + + pthread_cond_broadcast(&ctx->cond); + pthread_mutex_unlock(&ctx->mutex); + } + return NULL; +} + +int md_threadpool_init(ThreadPoolContext **pctx, unsigned int nb_threads) +{ + ThreadPoolContext *ctx; + int ret = 0; + + if (!nb_threads) + return -ENOSYS; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + pthread_mutex_init(&ctx->mutex, NULL); + pthread_cond_init(&ctx->cond, NULL); + + ctx->workers = calloc(nb_threads, sizeof(*ctx->workers)); + if (!ctx->workers) { + ret = -ENOMEM; + goto fail; + } + + for (int i = 0; i < nb_threads; i++) { + WorkerContext *w = &ctx->workers[i]; + + w->idx = i; + w->parent = ctx; + + ret = pthread_create(&w->thread, NULL, worker_thread, w); + if (ret) { + ret = -ret; + goto fail; + } + + ctx->nb_workers++; + } + + + *pctx = ctx; + return 0; +fail: + md_threadpool_free(&ctx); + 
return ret; +} + +void md_threadpool_free(ThreadPoolContext **pctx) +{ + ThreadPoolContext *ctx = *pctx; + + if (!ctx) + return; + + pthread_mutex_lock(&ctx->mutex); + ctx->finish = 1; + pthread_cond_broadcast(&ctx->cond); + pthread_mutex_unlock(&ctx->mutex); + + + for (int i = 0; i < ctx->nb_workers; i++) { + WorkerContext *w = &ctx->workers[i]; + pthread_join(w->thread, NULL); + } + + pthread_mutex_destroy(&ctx->mutex); + pthread_cond_destroy(&ctx->cond); + + free(ctx->workers); + + free(ctx); + *pctx = NULL; +} + +void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs, + void (*func)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads), + void *arg) +{ + pthread_mutex_lock(&ctx->mutex); + + ctx->func = func; + ctx->func_arg = arg; + + ctx->nb_jobs = nb_jobs; + ctx->nb_jobs_finished = 0; + ctx->next_job = 0; + + pthread_cond_broadcast(&ctx->cond); + while (ctx->nb_jobs_finished < ctx->nb_jobs) + pthread_cond_wait(&ctx->cond, &ctx->mutex); + + ctx->func = NULL; + ctx->func_arg = NULL; + + pthread_mutex_unlock(&ctx->mutex); +} diff --git a/src/threadpool.h b/src/threadpool.h new file mode 100644 index 0000000..0f6896d --- /dev/null +++ b/src/threadpool.h @@ -0,0 +1,32 @@ +/* + * Copyright 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef MD_THREADPOOL_H +#define MD_THREADPOOL_H + +typedef struct ThreadPoolContext ThreadPoolContext; + +int md_threadpool_init(ThreadPoolContext **ctx, unsigned int nb_threads); +void md_threadpool_free(ThreadPoolContext **ctx); + +void md_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs, + void (*func)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads), + void *arg); + +#endif /* MD_THREADPOOL_H */ diff --git a/src/x86inc.asm b/src/x86inc.asm new file mode 100644 index 0000000..dca1f78 --- /dev/null +++ b/src/x86inc.asm @@ -0,0 +1,1544 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2016 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Anton Mitrofanov <BugMaster@narod.ru> +;* Fiona Glaser <fiona@x264.com> +;* Henrik Gramner <henrik@gramner.com> +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%if HAVE_ALIGNED_STACK + %define STACK_ALIGNMENT 16 +%endif +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; aout does not support align= +; NOTE: This section is out of sync with x264, in order to +; keep supporting OS/2. 
+%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %if HAVE_CPUNOP + CPU %1 + %endif +%endmacro + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. 
+ +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + 
+%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if 
stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. 
+ %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
+ %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 1 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 
+ %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is 
executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. +%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPUNOP amdnop + %else + CPUNOP basicnop + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) 
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. 
+; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1 %+ SUFFIX, %1 +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 
+AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 
1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR 
pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2 +AVX_INSTR sqrtps, sse +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 
+AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 
+ %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/src/x86util.asm b/src/x86util.asm new file mode 100644 index 0000000..66280b2 --- /dev/null +++ b/src/x86util.asm @@ -0,0 +1,695 @@ +;***************************************************************************** +;* x86util.asm +;***************************************************************************** +;* Copyright (C) 2008-2010 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Holger Lubitz <holger@lubitz.org> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Symbol prefixes and a cpuflag alias, defined before the includes below.
; NOTE(review): presumably consumed by x86inc.asm -- confirm there.
%define private_prefix qms
%define public_prefix qms
%define cpuflags_mmxext cpuflags_mmx2

%include "config.asm"

%include "x86inc.asm"

; NOTE(review): the macros in this file rely on names that are not defined
; here (mova, movh, m<N>, mmsize, cpuflag(), avx_enabled, SWAP, ARCH_X86_64,
; r0..r5, q1032). They are assumed to come from x86inc.asm / config.asm.
; In particular SWAP is assumed to re-label registers at assembly time
; rather than emit instructions -- confirm against x86inc.asm.

; SBUTTERFLY size, a, b, tmp
; Interleaves m<a> and m<b> with punpckl<size> / punpckh<size>.
; The low interleave ends up in m<a>; the high interleave is written to
; m<tmp>, and SWAP then exchanges the names <b> and <tmp>.
; Without AVX a copy of m<a> is taken first because the two-operand forms
; overwrite their destination.
%macro SBUTTERFLY 4
%if avx_enabled == 0
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
%else
    punpckh%1 m%4, m%2, m%3
    punpckl%1 m%2, m%3
%endif
    SWAP %3, %4
%endmacro

; SBUTTERFLY2 size, a, b, tmp
; Three-operand-only variant: m<tmp> = punpckl(a, b), m<a> = punpckh(a, b),
; then SWAP a, tmp, b re-labels the three registers.
%macro SBUTTERFLY2 4
    punpckl%1 m%4, m%2, m%3
    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro

; SBUTTERFLYPS a, b, tmp
; Same pattern as SBUTTERFLY2 but with the float instructions
; unpcklps / unpckhps.
%macro SBUTTERFLYPS 3
    unpcklps m%3, m%1, m%2
    unpckhps m%1, m%1, m%2
    SWAP %1, %3, %2
%endmacro

; TRANSPOSE4x4B r0, r1, r2, r3, tmp
; Two SBUTTERFLY passes at byte->word granularity followed by two at
; word->dword granularity, then SWAP %2, %3 to restore row order.
; Per its name this transposes a 4x4 block of bytes.
%macro TRANSPOSE4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SWAP %2, %3
%endmacro

; TRANSPOSE4x4W r0, r1, r2, r3, tmp
; Same structure as TRANSPOSE4x4B, one element size up (wd then dq).
%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro

; TRANSPOSE2x4x4W r0, r1, r2, r3, tmp
; The wd/dq passes of TRANSPOSE4x4W followed by a qdq pass on the pairs
; (%1, %2) and (%3, %4); no final SWAP is performed.
%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro

; TRANSPOSE4x4D r0, r1, r2, r3, tmp
; Same structure as TRANSPOSE4x4W for dword elements (dq then qdq).
%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
; Arguments: r0, r1, r2, r3, tmp. The 64-bit halves are combined with
; movlhps / movhlps instead of a qdq butterfly.
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    movlhps m%5, m%1, m%3
    movhlps m%3, m%1
    SWAP %5, %1
    movlhps m%5, m%2, m%4
    movhlps m%4, m%2
    SWAP %5, %2, %3
%endmacro

; TRANSPOSE8x8W r0..r7, t0 [, t1 [, flag]]
; Three SBUTTERFLY passes (wd, dq, qdq) over eight registers.
; On x86-64 %9 is a register number used as scratch for every butterfly.
; On x86-32 %9 and %10 are used as movdqa operands (spill slots) and the
; butterflies borrow one of the eight data registers as scratch, spilling
; and reloading around it; the optional 11th argument suppresses the first
; spill and the last reload (see the in/out notes below).
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in: m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    movdqa %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    movdqa %10, m%2
    movdqa m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa m%5, %10
%endif
%endif
%endmacro

; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
; PABSW dst, src: dst = |src| per word.
;   ssse3  : pabsw
;   mmxext : dst = max(0 - src, src)
;   other  : builds a sign mask in dst, applies xor/sub to src, then SWAPs
;            the names so the result is called dst (src is overwritten).
%macro PABSW 2
%if cpuflag(ssse3)
    pabsw      %1, %2
%elif cpuflag(mmxext)
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%else
    pxor       %1, %1
    pcmpgtw    %1, %2
    pxor       %2, %1
    psubw      %2, %1
    SWAP       %1, %2
%endif
%endmacro

; PSIGNW_MMX dst, mask: dst = (dst xor mask) - mask per word.
; NOTE(review): this is a conditional negate only if each word of mask is
; 0 or -1 -- presumably a sign mask; confirm with callers.
%macro PSIGNW_MMX 2
    pxor       %1, %2
    psubw      %1, %2
%endmacro

; PSIGNW_SSSE3 dst, src: thin wrapper around psignw.
%macro PSIGNW_SSSE3 2
    psignw     %1, %2
%endmacro

; ABS1 a, tmp: in-place per-word absolute value of a.
; tmp is clobbered on the non-ssse3 paths and untouched with ssse3.
%macro ABS1 2
%if cpuflag(ssse3)
    pabsw   %1, %1
%elif cpuflag(mmxext) ; a, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%else ; a, tmp
    pxor       %2, %2
    pcmpgtw    %2, %1
    pxor       %1, %2
    psubw      %1, %2
%endif
%endmacro

; ABS2 a, b, tmp0, tmp1: ABS1 applied to two registers with the
; instructions of both interleaved.
%macro ABS2 4
%if cpuflag(ssse3)
    pabsw   %1, %1
    pabsw   %2, %2
%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%else ; a, b, tmp0, tmp1
    pxor       %3, %3
    pxor       %4, %4
    pcmpgtw    %3, %1
    pcmpgtw    %4, %2
    pxor       %1, %3
    pxor       %2, %4
    psubw      %1, %3
    psubw      %2, %4
%endif
%endmacro

; In-place per-byte absolute value. The fallback computes 0 - x and keeps
; the unsigned minimum of x and -x.
%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
%if cpuflag(ssse3)
    pabsb   %1, %1
%else
    pxor    %2, %2
    psubb   %2, %1
    pminub  %1, %2
%endif
%endmacro

; ABSB applied to two registers with interleaved instructions.
%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
%if cpuflag(ssse3)
    pabsb   %1, %1
    pabsb   %2, %2
%else
    pxor    %3, %3
    pxor    %4, %4
    psubb   %3, %1
    psubb   %4, %2
    pminub  %1, %3
    pminub  %2, %4
%endif
%endmacro

; ABSD2_MMX a, b, tmp0, tmp1: in-place per-dword absolute value of a and b
; via the sign-mask / xor / subtract sequence; both temporaries are
; clobbered.
%macro ABSD2_MMX 4
    pxor    %3, %3
    pxor    %4, %4
    pcmpgtd %3, %1
    pcmpgtd %4, %2
    pxor    %1, %3
    pxor    %2, %4
    psubd   %1, %3
    psubd   %2, %4
%endmacro

; ABS4 a, b, c, d, tmp0, tmp1: ABS2 on (a, b) and then on (c, d), sharing
; the two temporaries.
%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro

; SPLATB_LOAD dst, addr, mask
; Loads a dword from [addr-3], so the byte at addr is byte 3 of the loaded
; dword, then broadcasts a byte:
;   ssse3 : pshufb with the caller-supplied mask %3
;           (NOTE(review): the mask presumably selects byte 3 -- confirm)
;   other : punpcklbw duplicates each byte, making word 3 two copies of the
;           byte at addr; SPLATW then broadcasts word 3 (mask is unused).
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1
    SPLATW    %1, %1, 3
%endif
%endmacro

; SPLATB_REG dst, gpr, mask
; Moves the dword form of general-purpose register %2 (written as %2d)
; into dst and broadcasts a byte: pshufb with mask %3 on ssse3, otherwise
; punpcklbw followed by SPLATW of word 0 (mask is unused).
%macro SPLATB_REG 3
%if cpuflag(ssse3)
    movd      %1, %2d
    pshufb    %1, %3
%else
    movd      %1, %2d
    punpcklbw %1, %1
    SPLATW    %1, %1, 0
%endif
%endmacro

; PALIGNR [dst,] src1, src2, imm, tmp
; ssse3: emits palignr directly, using the 4-operand spelling when five
;        arguments are given (tmp is not referenced).
; mmx fallback: emulates the byte alignment with shifts and an OR:
;   - with five arguments dst is first loaded from src1 (unless they are
;     textually identical) and the argument list is rotated by one so the
;     remaining code sees (src1, src2, imm, tmp);
;   - tmp receives src2 unless they are textually identical;
;   - dst is shifted left and tmp right (qword shifts for mmsize 8, whole
;     register byte shifts otherwise), then tmp is ORed into dst.
; If neither ssse3 nor mmx is flagged the macro expands to nothing.
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
    palignr %1, %2, %3, %4
%else
    palignr %1, %2, %3
%endif
%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
    %define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova    %%dst, %2
%endif
    %rotate 1
%endif
%ifnidn %4, %2
    mova    %4, %2
%endif
%if mmsize==8
    psllq   %%dst, (8-%3)*8
    psrlq   %4, %3*8
%else
    pslldq  %%dst, 16-%3
    psrldq  %4, %3
%endif
    por     %%dst, %4
%endif
%endmacro

; PAVGB dst, src: pavgb when mmxext is flagged, otherwise pavgusb when
; 3dnow is flagged. With neither flag the macro expands to nothing.
%macro PAVGB 2
%if cpuflag(mmxext)
    pavgb   %1, %2
%elif cpuflag(3dnow)
    pavgusb %1, %2
%endif
%endmacro

; PSHUFLW operands...: forwards the whole operand list to pshufw when
; mmsize is 8 and to pshuflw otherwise.
%macro PSHUFLW 1+
    %if mmsize == 8
        pshufw %1
    %else
        pshuflw %1
    %endif
%endmacro

; PSWAPD dst, src: swaps the two dwords of a 64-bit register.
;   mmxext   : pshufw with the q1032 selector
;   3dnowext : pswapd
;   3dnow    : copy, shift right by 32, then punpckldq with src
; With none of these flags the macro expands to nothing.
%macro PSWAPD 2
%if cpuflag(mmxext)
    pshufw    %1, %2, q1032
%elif cpuflag(3dnowext)
    pswapd    %1, %2
%elif cpuflag(3dnow)
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro

; Splits the bytes of two registers by word position.
; If %5 is a number it names a register already holding the mask;
; otherwise %5 is loaded into m%1 with mova and used as the mask.
; Result: m%1 = m%2 & mask, m%3 = m%4 & mask, and m%2 / m%4 are shifted
; right by 8 bits per word.
; NOTE(review): the mask is presumably 0x00ff per word -- confirm.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro

; SUMSUB_BA size, a, b [, tmp]
; Result: a = a + b, b = b - a (using the original a).
; Without tmp the difference is formed as 2*b - (a + b); with tmp the
; original a is preserved in (or the sum is built in) tmp.
%macro SUMSUB_BA 3-4
%if %0==3
    padd%1  m%2, m%3
    padd%1  m%3, m%3
    psub%1  m%3, m%2
%else
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    psub%1  m%3, m%2
    SWAP    %2, %4
%endif
%endif
%endmacro

; SUMSUB_BADC size, a, b, c, d [, tmp]
; SUMSUB_BA on the pairs (a, b) and (c, d). With tmp it calls SUMSUB_BA
; twice; without it the temp-free sequence is inlined with the two pairs
; interleaved.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1  m%2, m%3
    padd%1  m%4, m%5
    padd%1  m%3, m%3
    padd%1  m%5, m%5
    psub%1  m%3, m%2
    psub%1  m%5, m%4
%endif
%endmacro

; SUMSUB2_AB size, a, b, out
; Result: m<a> = 2*a + b, m<out> = a - 2*b.
; If %3 is a number it is a register index; otherwise %3 is used verbatim
; as the operand (e.g. a memory reference).
%macro SUMSUB2_AB 4
%ifnum %3
    psub%1  m%4, m%2, m%3
    psub%1  m%4, m%3
    padd%1  m%2, m%2
    padd%1  m%2, m%3
%else
    mova    m%4, m%2
    padd%1  m%2, m%2
    padd%1  m%2, %3
    psub%1  m%4, %3
    psub%1  m%4, %3
%endif
%endmacro

; SUMSUB2_BA size, a, b, tmp
; Result: a = a + 2*b, b = b - 2*a (using the original a).
%macro SUMSUB2_BA 4
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    padd%1  m%2, m%3
    psub%1  m%3, m%4
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    padd%1  m%4, m%3
    psub%1  m%3, m%2
    psub%1  m%3, m%2
    SWAP     %2,  %4
%endif
%endmacro

; SUMSUBD2_AB size, a, b, t0, t1
; Result: a = (a >> 1) - b, b = (b >> 1) + a (arithmetic shifts, original
; operands on the right-hand sides).
; If %4 is a number, t0/t1 are register indices and the results are
; re-labelled with SWAP; otherwise %4/%5 are used verbatim as mova
; destinations holding copies of the original b and a.
%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1  m%5, m%2, 1  ; %3: %3>>1
    psra%1  m%4, m%3, 1  ; %2: %2>>1
    padd%1  m%4, m%2     ; %3: %3>>1+%2
    psub%1  m%5, m%3     ; %2: %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    mova    %5, m%2
    mova    %4, m%3
    psra%1  m%3, 1  ; %3: %3>>1
    psra%1  m%2, 1  ; %2: %2>>1
    padd%1  m%3, %5 ; %3: %3>>1+%2
    psub%1  m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro

; DCT4_1D r0, r1, r2, r3, tmp
; One-dimensional 4-point transform on words built from SUMSUB_BADC,
; SUMSUB_BA and SUMSUB2_AB, with a final SWAP to order the outputs.
; If %5 is a number it is a scratch register index; otherwise [%5] is used
; as a memory slot to hold m%2 while register %2 serves as the output of
; SUMSUB2_AB.
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro

; IDCT4_1D size, r0, r1, r2, r3, t0 [, t1]
; One-dimensional 4-point inverse transform; %1 is the element-size suffix.
; If %6 is a number, %6 and %7 are scratch register indices. Otherwise %6 is
; a base address and two slots, [%6] and [%6+16] (size w) or [%6+32] (any
; other size), are passed to SUMSUBD2_AB as temporaries.
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro


; LOAD_DIFF dst, tmp, zero, src1, src2
; Loads bytes from src1 and src2 with movh and leaves the per-word
; difference src1 - src2 in dst; tmp is clobbered.
; With a zero register in %3 both inputs are widened by interleaving with
; it. With %3 == none, src1 is interleaved with src2 and src2 with itself,
; so the high bytes cancel in the subtraction and no zero register is
; needed.
%macro LOAD_DIFF 5
%ifidn %3, none
    movh       %1, %4
    movh       %2, %5
    punpcklbw  %1, %2
    punpcklbw  %2, %2
    psubw      %1, %2
%else
    movh       %1, %4
    punpcklbw  %1, %3
    movh       %2, %5
    punpcklbw  %2, %3
    psubw      %1, %2
%endif
%endmacro

; STORE_DCT r0, r1, r2, r3, base, offset
; Stores the low qwords of m%1..m%4 at [base+offset+0/8/16/24] and their
; high qwords at [base+offset+32/40/48/56].
%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro

; Runs LOAD_DIFF on four rows addressed from the two pointers
; (%8 and %9, defaulting to r0 and r2), alternating the two temporaries;
; %7 is passed through as LOAD_DIFF's third argument.
; Row offsets are 0, r1, 2*r1, r4 for the first pointer and 0, r3, 2*r3, r5
; for the second. When %10 (default 0) is non-zero both pointers are then
; advanced by 4*r1 / 4*r3.
; NOTE(review): r4 and r5 presumably hold 3*r1 and 3*r3 -- confirm with
; callers.
%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

; DIFFx2 res0, res1, tmp, zero, src0, src1 [, unused]
; For each residual register: arithmetic shift right by 6 and saturating
; add of the bytes loaded from the matching source (widened with %4).
; Both results are then packed into %2 with packuswb (%2 low half, %1 high
; half). The optional 7th argument is accepted but not referenced.
%macro DIFFx2 6-7
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3
    packuswb   %2, %1
%endmacro

; STORE_DIFF res, tmp, zero, mem
; Loads bytes from mem, widens them with zero, adds (res >> 6) with signed
; saturation, packs to unsigned bytes and stores the result back to mem.
%macro STORE_DIFF 4
    movh       %2, %4
    punpcklbw  %2, %3
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endmacro

; Two-row variant with a caller-chosen shift: rows at [source] and
; [source+stride] are widened, (add1 >> shift) and (add2 >> shift) are
; added with paddw, and the packed bytes are written back to the same
; addresses.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movh       %3, [%7]
    movh       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5
    packuswb   %4, %5
    movh       [%7], %3
    movh     [%7+%8], %4
%endmacro

; Per-byte unsigned minimum. The fallback computes
; dst - saturating_sub(dst, src), which clobbers %3.
%macro PMINUB 3 ; dst, src, ignored
%if cpuflag(mmxext)
    pminub  %1, %2
%else ; dst, src, tmp
    mova    %3, %1
    psubusb %3, %2
    psubb   %1, %3
%endif
%endmacro

; SPLATW dst, src [, index = 0]
; Broadcasts word <index> of src to every word of dst.
;   mmsize 16 : pshuflw with immediate index*0x55 (the 2-bit index repeated
;               in all four selector fields), then punpcklqdq to copy the
;               low qword into the high one
;   mmxext    : pshufw with the same immediate
;   other     : two punpck{l,h}wd steps chosen by bit 1 and then bit 0 of
;               the index
%macro SPLATW 2-3 0
%if mmsize == 16
    pshuflw    %1, %2, (%3)*0x55
    punpcklqdq %1, %1
%elif cpuflag(mmxext)
    pshufw     %1, %2, (%3)*0x55
%else
    %ifnidn %1, %2
        mova       %1, %2
    %endif
    %if %3 & 2
        punpckhwd  %1, %1
    %else
        punpcklwd  %1, %1
    %endif
    %if %3 & 1
        punpckhwd  %1, %1
    %else
        punpcklwd  %1, %1
    %endif
%endif
%endmacro

; SPLATD reg: broadcasts the low dword of reg to all dwords, in place.
; With mmsize != 8 and neither sse2 nor sse flagged, nothing is emitted.
%macro SPLATD 1
%if mmsize == 8
    punpckldq  %1, %1
%elif cpuflag(sse2)
    pshufd  %1, %1, 0
%elif cpuflag(sse)
    shufps  %1, %1, 0
%endif
%endmacro

; Clamps signed words: dst = min(max(dst, min), max).
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw %1, %2
    pminsw %1, %3
%endmacro

; Per-dword signed minimum without pminsd: tmp becomes the mask
; (src > dst), and the xor/and/xor sequence selects dst where the mask is
; set and src elsewhere.
%macro PMINSD_MMX 3 ; dst, src, tmp
    mova      %3, %2
    pcmpgtd   %3, %1
    pxor      %1, %2
    pand      %1, %3
    pxor      %1, %2
%endmacro

; Per-dword signed maximum without pmaxsd: tmp becomes the mask
; (dst > src); dst is kept where the mask is set and src is merged in
; elsewhere.
%macro PMAXSD_MMX 3 ; dst, src, tmp
    mova      %3, %1
    pcmpgtd   %3, %2
    pand      %1, %3
    pandn     %3, %2
    por       %1, %3
%endmacro

; Clamps signed dwords using the two emulation macros above: first the
; upper bound, then the lower bound.
%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
    PMINSD_MMX %1, %3, %4
    PMAXSD_MMX %1, %2, %4
%endmacro

; Clamps dwords by converting to single-precision floats, applying
; minps/maxps against float bounds and converting back.
; NOTE(review): values that are not exactly representable as a float are
; rounded by the conversion -- confirm callers only need that precision.
%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
    cvtdq2ps  %1, %1
    minps     %1, %3
    maxps     %1, %2
    cvtps2dq  %1, %1
%endmacro

; Clamps signed dwords with the native pminsd / pmaxsd instructions.
%macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused
    pminsd  %1, %3
    pmaxsd  %1, %2
%endmacro

; Broadcasts one 32-bit float from memory to every lane of dst:
; vbroadcastss with avx, otherwise movss + shufps with selector 0.
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
%if cpuflag(avx)
    vbroadcastss %1, %2
%else ; sse
    movss        %1, %2
    shufps       %1, %1, 0
%endif
%endmacro

; Broadcasts one 64-bit double from memory to every lane of dst:
; vbroadcastsd for 32-byte registers with avx, movddup with sse3,
; otherwise movsd + movlhps.
%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
%if cpuflag(avx) && mmsize == 32
    vbroadcastsd %1, %2
%elif cpuflag(sse3)
    movddup      %1, %2
%else ; sse2
    movsd        %1, %2
    movlhps      %1, %1
%endif
%endmacro

; SHUFFLE_MASK_W i0, ..., i7
; Emits 16 data bytes with db. For each of the eight arguments, a value
; >= 0x80 is emitted twice unchanged; any other value i is emitted as the
; byte pair 2*i, 2*i+1.
; NOTE(review): presumably a pshufb control mask that moves whole words,
; with values >= 0x80 producing zeroed words -- confirm with users.
%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1>=0x80
            db %1, %1
        %else
            db %1*2
            db %1*2+1
        %endif
        %rotate 1
    %endrep
%endmacro

; Sign-extends the four low words of src to dwords. The fallback copies src
; to dst if they differ, duplicates each low word into both halves of a
; dword, and arithmetic-shifts right by 16.
%macro PMOVSXWD 2; dst, src
%if cpuflag(sse4)
    pmovsxwd     %1, %2
%else
    %ifnidn %1, %2
    mova         %1, %2
    %endif
    punpcklwd    %1, %1
    psrad        %1, 16
%endif
%endmacro

; Wrapper for non-FMA version of fmaddps
; FMULADD_PS dst, a, b, c, tmp: dst = a * b + c.
; With fma3 or fma4 a single fmaddps is emitted and tmp is not referenced.
; Otherwise, when dst and c are textually identical the product is formed
; in tmp so that c is still intact for the add; in the remaining case the
; product is formed directly in dst.
%macro FMULADD_PS 5
    %if cpuflag(fma3) || cpuflag(fma4)
        fmaddps %1, %2, %3, %4
    %elifidn %1, %4
        mulps   %5, %2, %3
        addps   %1, %4, %5
    %else
        mulps   %1, %2, %3
        addps   %1, %4
    %endif
%endmacro

; Wrapper for non-FMA version of fmaddpd
; FMULADD_PD dst, a, b, c, tmp: double-precision counterpart of FMULADD_PS
; with the same three code paths.
%macro FMULADD_PD 5
    %if cpuflag(fma3) || cpuflag(fma4)
        fmaddpd %1, %2, %3, %4
    %elifidn %1, %4
        mulpd   %5, %2, %3
        addpd   %1, %4, %5
    %else
        mulpd   %1, %2, %3
        addpd   %1, %4
    %endif
%endmacro