diff options
author | Anton Khirnov <anton@khirnov.net> | 2017-11-16 13:11:07 +0100 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2017-11-19 16:30:14 +0100 |
commit | 0007a0b0c11fa7c12b228883453368f105a4324b (patch) | |
tree | a5ac8016c58c13668bf87931dd921bea933cf07f |
Initial commit.
The following code is present:
* the basis API
* the BiCGSTAB solver
* the pseudospectral linear system solver
* helper APIs:
- threadpool
- logging
- cpuid
-rw-r--r-- | Makefile | 45 | ||||
-rw-r--r-- | basis.c | 339 | ||||
-rw-r--r-- | basis.h | 47 | ||||
-rw-r--r-- | bicgstab.c | 411 | ||||
-rw-r--r-- | bicgstab.h | 60 | ||||
-rw-r--r-- | common.h | 42 | ||||
-rw-r--r-- | config.asm | 1325 | ||||
-rw-r--r-- | config.h | 12 | ||||
-rw-r--r-- | cpu.c | 220 | ||||
-rw-r--r-- | cpu.h | 130 | ||||
-rw-r--r-- | cpuid.asm | 63 | ||||
-rw-r--r-- | log.c | 40 | ||||
-rw-r--r-- | log.h | 32 | ||||
-rw-r--r-- | pssolve.c | 521 | ||||
-rw-r--r-- | pssolve.h | 188 | ||||
-rw-r--r-- | tests/pssolve.c | 139 | ||||
-rw-r--r-- | threadpool.c | 178 | ||||
-rw-r--r-- | threadpool.h | 32 | ||||
-rw-r--r-- | x86inc.asm | 1544 | ||||
-rw-r--r-- | x86util.asm | 695 |
20 files changed, 6063 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6affc77 --- /dev/null +++ b/Makefile @@ -0,0 +1,45 @@ +TARGET = libteukolskydata.so + +CFLAGS = -std=c99 -D_XOPEN_SOURCE=700 -fPIC -g -I. +TARGET_LDFLAGS = -Wl,--version-script=libteukolskydata.v -shared -lm -llapacke +TEST_LIBS = -lm -llapacke -lcblas -lpthread + +OBJS = basis.o \ + bicgstab.o \ + cpu.o \ + cpuid.o \ + log.o \ + pssolve.o \ + threadpool.o \ + +TESTPROGS = pssolve + +TESTPROGS := $(addprefix tests/,$(TESTPROGS)) + +all: $(TARGET) + +$(TARGET): $(OBJS) + cc ${TARGET_LDFLAGS} -o $@ $(OBJS) + +%.o: %.c + cc $(CFLAGS) -MMD -MF $(@:.o=.d) -MT $@ -c -o $@ $< + +%.o: %.asm + yasm -f elf -m amd64 -M $< > $(@:.o=.d) + yasm -f elf -m amd64 -o $@ $< + +clean: + -rm -f *.o *.d *.pyc $(TARGET) + +tests/%.o: tests/%.c + cc $(CFLAGS) -MMD -MF $(@:.o=.d) -MT $@ -c -o $@ $< + +tests/%: tests/%.o $(OBJS) + cc -o $@ $(@:=.o) $(OBJS) $(TEST_LIBS) + +test: $(TARGET) $(TESTPROGS) + LD_LIBRARY_PATH=. PYTHONPATH=. ./tests/convergence.py + +-include $(OBJS:.o=.d) + +.PHONY: clean test @@ -0,0 +1,339 @@ +/* + * Basis sets for pseudospectral methods + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "config.h" + +#include <errno.h> +#include <math.h> + +#include "basis.h" +#include "common.h" + +typedef struct BasisSet { + /* evaluate the idx-th basis function at the specified point*/ + double (*eval) (const BasisSetContext *s, double coord, unsigned int idx); + /* evaluate the first derivative of the idx-th basis function at the specified point*/ + double (*eval_diff1)(const BasisSetContext *s, double coord, unsigned int idx); + /* evaluate the second derivative of the idx-th basis function at the specified point*/ + double (*eval_diff2)(const BasisSetContext *s, double coord, unsigned int idx); + /** + * Get the idx-th collocation point for the specified order. + * idx runs from 0 to order - 1 (inclusive) + */ + double (*colloc_point)(const BasisSetContext *s, unsigned int order, unsigned int idx); +} BasisSet; + +struct BasisSetContext { + const BasisSet *bs; + double sf; +}; + +/* + * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) + * SB(x, n) = sin((n + 1) arccot(|x| / L)) + * They are symmetric wrt origin and decay as 1/x in infinity. 
+ */ +static double sb_even_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx *= 2; // even only + + return sin((idx + 1) * val); +} + +static double sb_even_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx *= 2; // even only + + return -s->sf * (idx + 1) * cos((idx + 1) * val) / (SQR(s->sf) + SQR(coord)); +} + +static double sb_even_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = atan2(sf, coord); + + idx *= 2; // even only + + return sf * (idx + 1) * (2 * coord * cos((idx + 1) * val) - sf * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(sf) + SQR(coord)); +} + +static double sb_even_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); +#if TD_POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else + t = (idx + 2) * M_PI / (2 * order + 2); +#endif + return s->sf / tan(t); +} + +static const BasisSet sb_even_basis = { + .eval = sb_even_eval, + .eval_diff1 = sb_even_eval_diff1, + .eval_diff2 = sb_even_eval_diff2, + .colloc_point = sb_even_colloc_point, +}; + +static double sb_odd_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx = 2 * idx + 2; // odd only + + return sin((idx) * val); +} + +static double sb_odd_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = atan2(s->sf, coord); + + idx = 2 * idx + 2; // odd only + + return -s->sf * (idx) * cos((idx) * val) / (SQR(s->sf) + SQR(coord)); +} + +static double sb_odd_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = atan2(sf, coord); + + idx = 2 * idx + 2; // odd only + + return sf * (idx) * (2 * coord * cos((idx) * val) - sf * (idx) * sin((idx) * val)) / 
SQR(SQR(sf) + SQR(coord)); +} + +static double sb_odd_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); +#if TD_POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else + t = (idx + 2) * M_PI / (2 * order + 3); +#endif + return s->sf / tan(t); +} + +static const BasisSet sb_odd_basis = { + .eval = sb_odd_eval, + .eval_diff1 = sb_odd_eval_diff1, + .eval_diff2 = sb_odd_eval_diff2, + .colloc_point = sb_odd_colloc_point, +}; + +static double tb_even_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(s->sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return cos(idx * val) - 1.0; +} + +static double tb_even_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(s->sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return s->sf * idx * SGN(coord) * sin(idx * val) / (SQR(s->sf) + SQR(coord)); +} + +static double tb_even_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + const double sf = s->sf; + double val = (coord == 0.0) ? 
M_PI_2 : atan(sf / fabs(coord)); + + idx++; + idx *= 2; // even only + + return -sf * idx * SGN(coord) * (2 * fabs(coord) * sin(idx * val) + sf * idx * cos(idx * val)) / SQR(SQR(sf) + SQR(coord)); +} + +static double tb_even_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); + t = (idx + 2) * M_PI / (2 * order + 4); + return s->sf / tan(t); +} + +static const BasisSet tb_even_basis = { + .eval = tb_even_eval, + .eval_diff1 = tb_even_eval_diff1, + .eval_diff2 = tb_even_eval_diff2, + .colloc_point = tb_even_colloc_point, +}; + +static double cos_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + return cos(idx * coord); +} + +static double cos_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -1.0 * idx * sin(idx * coord); +} + +static double cos_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -1.0 * SQR(idx) * cos(idx * coord); +} + +static double cos_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + return M_PI * (idx + 1) / (order + 2); +} + +static const BasisSet cos_basis = { + .eval = cos_eval, + .eval_diff1 = cos_eval_diff1, + .eval_diff2 = cos_eval_diff2, + .colloc_point = cos_colloc_point, +}; + +static double cos_even_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + return cos(2 * idx * coord); +} + +static double cos_even_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -2.0 * idx * sin(2.0 * idx * coord); +} + +static double cos_even_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -4.0 * SQR(idx) * cos(2.0 * idx * coord); +} + +static double cos_even_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + return M_PI * (idx + 1) / (2 * order + 2); +} + +static const BasisSet cos_even_basis = { + .eval = 
cos_even_eval, + .eval_diff1 = cos_even_eval_diff1, + .eval_diff2 = cos_even_eval_diff2, + .colloc_point = cos_even_colloc_point, +}; + +static double cos_4_eval(const BasisSetContext *s, double coord, unsigned int idx) +{ + return cos(4 * idx * coord); +} + +static double cos_4_eval_diff1(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -4.0 * idx * sin(4.0 * idx * coord); +} + +static double cos_4_eval_diff2(const BasisSetContext *s, double coord, unsigned int idx) +{ + return -16.0 * SQR(idx) * cos(4.0 * idx * coord); +} + +static double cos_4_colloc_point(const BasisSetContext *s, unsigned int order, unsigned int idx) +{ + return M_PI * (idx + 1) / (4 * order + 4); +} + +static const BasisSet cos_4_basis = { + .eval = cos_4_eval, + .eval_diff1 = cos_4_eval_diff1, + .eval_diff2 = cos_4_eval_diff2, + .colloc_point = cos_4_colloc_point, +}; + +double tdi_basis_eval(const BasisSetContext *s, enum BSEvalType type, + double coord, unsigned int order) +{ + double (*eval)(const BasisSetContext *, double, unsigned int) = NULL; + + switch (type) { + case BS_EVAL_TYPE_VALUE: eval = s->bs->eval; break; + case BS_EVAL_TYPE_DIFF1: eval = s->bs->eval_diff1; break; + case BS_EVAL_TYPE_DIFF2: eval = s->bs->eval_diff2; break; + } + + return eval(s, coord, order); +} + +double tdi_basis_colloc_point(const BasisSetContext *s, unsigned int order, + unsigned int idx) +{ + return s->bs->colloc_point(s, order, idx); +} + +void tdi_basis_free(BasisSetContext **pctx) +{ + BasisSetContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx); + *pctx = NULL; +} + +int tdi_basis_init(BasisSetContext **pctx, enum BasisFamily family, double sf) +{ + BasisSetContext *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + switch (family) { + case BASIS_FAMILY_TB_EVEN: ctx->bs = &tb_even_basis; break; + case BASIS_FAMILY_SB_EVEN: ctx->bs = &sb_even_basis; break; + case BASIS_FAMILY_SB_ODD: ctx->bs = &sb_odd_basis; break; + case BASIS_FAMILY_COS: ctx->bs = 
&cos_basis; break; + case BASIS_FAMILY_COS_EVEN: ctx->bs = &cos_even_basis; break; + case BASIS_FAMILY_COS_4: ctx->bs = &cos_4_basis; break; + default: + free(ctx); + return -EINVAL; + } + + ctx->sf = sf; + + *pctx = ctx; + return 0; +} @@ -0,0 +1,47 @@ +/* + * Basis sets for pseudospectral methods + * Copyright 2017 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef TEUKOLSKY_DATA_BASIS_H +#define TEUKOLSKY_DATA_BASIS_H + +enum BSEvalType { + BS_EVAL_TYPE_VALUE, + BS_EVAL_TYPE_DIFF1, + BS_EVAL_TYPE_DIFF2, +}; + +enum BasisFamily { + BASIS_FAMILY_TB_EVEN, + BASIS_FAMILY_SB_EVEN, + BASIS_FAMILY_SB_ODD, + BASIS_FAMILY_COS, + BASIS_FAMILY_COS_EVEN, + BASIS_FAMILY_COS_4, +}; + +typedef struct BasisSetContext BasisSetContext; + +int tdi_basis_init(BasisSetContext **ctx, enum BasisFamily family, double sf); +void tdi_basis_free(BasisSetContext **ctx); + +double tdi_basis_eval(const BasisSetContext *ctx, enum BSEvalType type, + double coord, unsigned int order); +double tdi_basis_colloc_point(const BasisSetContext *ctx, unsigned int order, + unsigned int idx); + +#endif /* TEUKOLSKY_DATA_BASIS_H */ diff --git a/bicgstab.c b/bicgstab.c new file mode 100644 index 0000000..9b4d330 --- /dev/null +++ b/bicgstab.c @@ -0,0 +1,411 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "config.h" + +#if HAVE_OPENCL +#include <cl.h> +#include <clBLAS.h> +#endif + +#include <cblas.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "bicgstab.h" +#include "common.h" + +#define BICGSTAB_MAXITER 16 +#define BICGSTAB_TOL (1e-15) + +struct BiCGStabContext { + int N; + + double *x; + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + +#if HAVE_OPENCL + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + cl_mem cl_x; + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; +#endif +}; + +#if HAVE_OPENCL +static int solve_cl(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + cl_command_queue ocl_q = ctx->ocl_queue; + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + // upload the matrix and RHS + clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &ocl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + + for (i = 0; i < MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double 
beta = (rho / rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 
1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &ocl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N, + x, 0, NULL, NULL); + return i; +} +#endif + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_sw(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, ctx->x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + + for (i = 0; i < BICGSTAB_MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + 
cblas_daxpy(N, alpha, y, 1, ctx->x, 1); + cblas_daxpy(N, omega, z, 1, ctx->x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + memcpy(x, ctx->x, sizeof(*x) * ctx->N); + + return i; +} + +int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) +{ + int ret; + +#if HAVE_OPENCL + if (ctx->ocl_ctx) + ret = solve_cl(ctx, mat, rhs, x); + else +#endif + ret = solve_sw(ctx, mat, rhs, x); + if (ret < 0) + return ret; + +#if MD_VERIFY + { + int i; + double *y; + + y = malloc(sizeof(*y) * ctx->N); + memcpy(y, rhs, sizeof(*y) * ctx->N); + cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0, + mat, ctx->N, x, 1, 1.0, y, 1); + i = cblas_idamax(ctx->N, y, 1); + if (fabs(y[i]) > 1e-11) + abort(); + } +#endif + + return ret; +} + +int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0) +{ +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + cl_event events[2]; + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double), + k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double), + x0, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } else +#endif + { + memcpy(ctx->x, x0, ctx->N * sizeof(*x0)); + memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k)); + } + + return 0; +} + +int tdi_bicgstab_context_alloc(BiCGStabContext **pctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q) +{ + BiCGStabContext *ctx; + int ret = 0; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->N = N; + +#if HAVE_OPENCL + if (ocl_ctx) { + ctx->ocl_ctx = ocl_ctx; + ctx->ocl_queue = ocl_q; + +#define ALLOC(dst, size) \ +do { \ + ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \ + if (ret != CL_SUCCESS) \ + goto fail; \ +} while (0) + + ALLOC(cl_x, N * sizeof(double)); + ALLOC(cl_p, N * 
sizeof(double)); + ALLOC(cl_v, N * sizeof(double)); + ALLOC(cl_y, N * sizeof(double)); + ALLOC(cl_z, N * sizeof(double)); + ALLOC(cl_t, N * sizeof(double)); + ALLOC(cl_res, N * sizeof(double)); + ALLOC(cl_res0, N * sizeof(double)); + ALLOC(cl_tmp, N * sizeof(double)); + ALLOC(cl_tmp1, N * 2 * sizeof(double)); + + ALLOC(cl_k, N * N * sizeof(double)); + ALLOC(cl_mat, N * N * sizeof(double)); + + ALLOC(cl_rho, sizeof(double)); + ALLOC(cl_alpha, sizeof(double)); + ALLOC(cl_beta, sizeof(double)); + ALLOC(cl_omega, 2 * sizeof(double)); + ALLOC(cl_omega1, sizeof(double)); + } else +#endif + { + ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N); + } + +fail: + if (ret) { + tdi_bicgstab_context_free(&ctx); + return -ENOMEM; + } + + *pctx = ctx; + return 0; +} + +void tdi_bicgstab_context_free(BiCGStabContext **pctx) +{ + BiCGStabContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx->x); + free(ctx->p); + free(ctx->v); + free(ctx->y); + free(ctx->z); + free(ctx->t); + free(ctx->res); + free(ctx->res0); + free(ctx->k); + +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + clReleaseMemObject(ctx->cl_x); + clReleaseMemObject(ctx->cl_p); + clReleaseMemObject(ctx->cl_v); + clReleaseMemObject(ctx->cl_y); + clReleaseMemObject(ctx->cl_z); + clReleaseMemObject(ctx->cl_t); + clReleaseMemObject(ctx->cl_res); + clReleaseMemObject(ctx->cl_res0); + clReleaseMemObject(ctx->cl_tmp); + clReleaseMemObject(ctx->cl_tmp1); + + clReleaseMemObject(ctx->cl_k); + 
clReleaseMemObject(ctx->cl_mat); + + clReleaseMemObject(ctx->cl_rho); + clReleaseMemObject(ctx->cl_alpha); + clReleaseMemObject(ctx->cl_beta); + clReleaseMemObject(ctx->cl_omega); + clReleaseMemObject(ctx->cl_omega1); + } +#endif + + free(ctx); + *pctx = NULL; +} diff --git a/bicgstab.h b/bicgstab.h new file mode 100644 index 0000000..338a4a9 --- /dev/null +++ b/bicgstab.h @@ -0,0 +1,60 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef TEUKOLSKY_DATA_BICGSTAB_H +#define TEUKOLSKY_DATA_BICGSTAB_H + +#include "config.h" + +#if HAVE_OPENCL +#include <cl.h> +#else +typedef void* cl_context; +typedef void* cl_command_queue; +#endif + +typedef struct BiCGStabContext BiCGStabContext; + +/** + * Allocate and initialize the solver for the NxN system. + * + * If the OpenCL context and command queue are provided (non-NULL), the solver + * will run using clBLAS. + */ +int tdi_bicgstab_context_alloc(BiCGStabContext **ctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q); + +/** + * Free the solver and all its internal state. + */ +void tdi_bicgstab_context_free(BiCGStabContext **ctx); + +/** + * Initialise the solver with the given preconditioner matrix. This function + * may be called any number of times on a given solver context. 
+ */ +int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0); + +/** + * Solve the linear system + * mat · x = rhs + * The result is written into x. + */ +int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x); + +#endif /* TEUKOLSKY_DATA_BICGSTAB_H */ diff --git a/common.h b/common.h new file mode 100644 index 0000000..16dd7ef --- /dev/null +++ b/common.h @@ -0,0 +1,42 @@ +/* + * Copyright 2017 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef TEUKOLSKY_DATA_COMMON_H +#define TEUKOLSKY_DATA_COMMON_H + +#define SQR(x) ((x) * (x)) +#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) > (y) ? 
(y) : (x)) +#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) + +/* + * small number to avoid r=0 singularities + */ +#define EPS 1E-08 + +#include <stdlib.h> +#include <stdint.h> +#include <sys/time.h> +static inline int64_t gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + +#endif /* TEUKOLSKY_DATA_COMMON_H */ diff --git a/config.asm b/config.asm new file mode 100644 index 0000000..0ee0ca2 --- /dev/null +++ b/config.asm @@ -0,0 +1,1325 @@ +%define ARCH_AARCH64 0 +%define ARCH_ALPHA 0 +%define ARCH_ARM 0 +%define ARCH_AVR32 0 +%define ARCH_AVR32_AP 0 +%define ARCH_AVR32_UC 0 +%define ARCH_BFIN 0 +%define ARCH_IA64 0 +%define ARCH_M68K 0 +%define ARCH_MIPS 0 +%define ARCH_MIPS64 0 +%define ARCH_PARISC 0 +%define ARCH_PPC 0 +%define ARCH_PPC64 0 +%define ARCH_S390 0 +%define ARCH_SH4 0 +%define ARCH_SPARC 0 +%define ARCH_SPARC64 0 +%define ARCH_TILEGX 0 +%define ARCH_TILEPRO 0 +%define ARCH_TOMI 0 +%define ARCH_X86 1 +%define ARCH_X86_32 0 +%define ARCH_X86_64 1 +%define HAVE_ARMV5TE 0 +%define HAVE_ARMV6 0 +%define HAVE_ARMV6T2 0 +%define HAVE_ARMV8 0 +%define HAVE_NEON 0 +%define HAVE_VFP 0 +%define HAVE_VFPV3 0 +%define HAVE_ALTIVEC 0 +%define HAVE_DCBZL 1 +%define HAVE_LDBRX 1 +%define HAVE_PPC4XX 0 +%define HAVE_AMD3DNOW 1 +%define HAVE_AMD3DNOWEXT 1 +%define HAVE_AVX 1 +%define HAVE_AVX2 1 +%define HAVE_FMA3 1 +%define HAVE_FMA4 1 +%define HAVE_MMX 1 +%define HAVE_MMXEXT 1 +%define HAVE_SSE 1 +%define HAVE_SSE2 1 +%define HAVE_SSE3 1 +%define HAVE_SSE4 1 +%define HAVE_SSE42 1 +%define HAVE_SSSE3 1 +%define HAVE_XOP 1 +%define HAVE_CPUNOP 1 +%define HAVE_I686 1 +%define HAVE_LOONGSON 1 +%define HAVE_VIS 1 +%define HAVE_ARMV5TE_EXTERNAL 0 +%define HAVE_ARMV6_EXTERNAL 0 +%define HAVE_ARMV6T2_EXTERNAL 0 +%define HAVE_ARMV8_EXTERNAL 0 +%define HAVE_NEON_EXTERNAL 0 +%define HAVE_VFP_EXTERNAL 0 +%define HAVE_VFPV3_EXTERNAL 0 +%define HAVE_ALTIVEC_EXTERNAL 0 +%define HAVE_DCBZL_EXTERNAL 0 
+%define HAVE_LDBRX_EXTERNAL 0 +%define HAVE_PPC4XX_EXTERNAL 0 +%define HAVE_AMD3DNOW_EXTERNAL 1 +%define HAVE_AMD3DNOWEXT_EXTERNAL 1 +%define HAVE_AVX_EXTERNAL 1 +%define HAVE_AVX2_EXTERNAL 1 +%define HAVE_FMA3_EXTERNAL 1 +%define HAVE_FMA4_EXTERNAL 1 +%define HAVE_MMX_EXTERNAL 1 +%define HAVE_MMXEXT_EXTERNAL 1 +%define HAVE_SSE_EXTERNAL 1 +%define HAVE_SSE2_EXTERNAL 1 +%define HAVE_SSE3_EXTERNAL 1 +%define HAVE_SSE4_EXTERNAL 1 +%define HAVE_SSE42_EXTERNAL 1 +%define HAVE_SSSE3_EXTERNAL 1 +%define HAVE_XOP_EXTERNAL 1 +%define HAVE_CPUNOP_EXTERNAL 0 +%define HAVE_I686_EXTERNAL 0 +%define HAVE_LOONGSON_EXTERNAL 0 +%define HAVE_VIS_EXTERNAL 0 +%define HAVE_ARMV5TE_INLINE 0 +%define HAVE_ARMV6_INLINE 0 +%define HAVE_ARMV6T2_INLINE 0 +%define HAVE_ARMV8_INLINE 0 +%define HAVE_NEON_INLINE 0 +%define HAVE_VFP_INLINE 0 +%define HAVE_VFPV3_INLINE 0 +%define HAVE_ALTIVEC_INLINE 0 +%define HAVE_DCBZL_INLINE 0 +%define HAVE_LDBRX_INLINE 0 +%define HAVE_PPC4XX_INLINE 0 +%define HAVE_AMD3DNOW_INLINE 1 +%define HAVE_AMD3DNOWEXT_INLINE 1 +%define HAVE_AVX_INLINE 1 +%define HAVE_AVX2_INLINE 1 +%define HAVE_FMA3_INLINE 1 +%define HAVE_FMA4_INLINE 1 +%define HAVE_MMX_INLINE 1 +%define HAVE_MMXEXT_INLINE 1 +%define HAVE_SSE_INLINE 1 +%define HAVE_SSE2_INLINE 1 +%define HAVE_SSE3_INLINE 1 +%define HAVE_SSE4_INLINE 1 +%define HAVE_SSE42_INLINE 1 +%define HAVE_SSSE3_INLINE 1 +%define HAVE_XOP_INLINE 1 +%define HAVE_CPUNOP_INLINE 0 +%define HAVE_I686_INLINE 0 +%define HAVE_LOONGSON_INLINE 0 +%define HAVE_VIS_INLINE 0 +%define HAVE_ALIGNED_STACK 1 +%define HAVE_FAST_64BIT 1 +%define HAVE_FAST_CLZ 1 +%define HAVE_FAST_CMOV 1 +%define HAVE_LOCAL_ALIGNED_8 1 +%define HAVE_LOCAL_ALIGNED_16 1 +%define HAVE_SIMD_ALIGN_16 1 +%define HAVE_ATOMICS_GCC 1 +%define HAVE_ATOMICS_SUNCC 0 +%define HAVE_ATOMICS_WIN32 0 +%define HAVE_ATOMIC_CAS_PTR 0 +%define HAVE_MACHINE_RW_BARRIER 0 +%define HAVE_MEMORYBARRIER 0 +%define HAVE_MM_EMPTY 1 +%define HAVE_RDTSC 0 +%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1 
+%define HAVE_INLINE_ASM 1 +%define HAVE_SYMVER 1 +%define HAVE_YASM 1 +%define HAVE_BIGENDIAN 0 +%define HAVE_FAST_UNALIGNED 1 +%define HAVE_ALSA_ASOUNDLIB_H 1 +%define HAVE_ALTIVEC_H 0 +%define HAVE_ARPA_INET_H 1 +%define HAVE_CDIO_PARANOIA_H 0 +%define HAVE_CDIO_PARANOIA_PARANOIA_H 0 +%define HAVE_DEV_BKTR_IOCTL_BT848_H 0 +%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0 +%define HAVE_DEV_IC_BT8XX_H 0 +%define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0 +%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0 +%define HAVE_DIRECT_H 0 +%define HAVE_DLFCN_H 1 +%define HAVE_DXVA_H 0 +%define HAVE_GSM_H 0 +%define HAVE_IO_H 0 +%define HAVE_MACH_MACH_TIME_H 0 +%define HAVE_MACHINE_IOCTL_BT848_H 0 +%define HAVE_MACHINE_IOCTL_METEOR_H 0 +%define HAVE_MALLOC_H 1 +%define HAVE_POLL_H 1 +%define HAVE_SNDIO_H 0 +%define HAVE_SOUNDCARD_H 0 +%define HAVE_SYS_MMAN_H 1 +%define HAVE_SYS_PARAM_H 1 +%define HAVE_SYS_RESOURCE_H 1 +%define HAVE_SYS_SELECT_H 1 +%define HAVE_SYS_SOUNDCARD_H 1 +%define HAVE_SYS_TIME_H 1 +%define HAVE_SYS_UN_H 1 +%define HAVE_SYS_VIDEOIO_H 0 +%define HAVE_UNISTD_H 1 +%define HAVE_WINDOWS_H 0 +%define HAVE_WINSOCK2_H 0 +%define HAVE_INTRINSICS_NEON 0 +%define HAVE_ATANF 1 +%define HAVE_ATAN2F 1 +%define HAVE_CBRTF 1 +%define HAVE_COSF 1 +%define HAVE_EXP2 1 +%define HAVE_EXP2F 1 +%define HAVE_EXPF 1 +%define HAVE_ISINF 1 +%define HAVE_ISNAN 1 +%define HAVE_LDEXPF 1 +%define HAVE_LLRINT 1 +%define HAVE_LLRINTF 1 +%define HAVE_LOG2 1 +%define HAVE_LOG2F 1 +%define HAVE_LOG10F 1 +%define HAVE_LRINT 1 +%define HAVE_LRINTF 1 +%define HAVE_POWF 1 +%define HAVE_RINT 1 +%define HAVE_ROUND 1 +%define HAVE_ROUNDF 1 +%define HAVE_SINF 1 +%define HAVE_TRUNC 1 +%define HAVE_TRUNCF 1 +%define HAVE_ALIGNED_MALLOC 0 +%define HAVE_CLOSESOCKET 0 +%define HAVE_COMMANDLINETOARGVW 0 +%define HAVE_COTASKMEMFREE 0 +%define HAVE_CRYPTGENRANDOM 0 +%define HAVE_DLOPEN 1 +%define HAVE_FCNTL 1 +%define HAVE_FLT_LIM 1 +%define HAVE_FORK 1 +%define HAVE_GETADDRINFO 1 +%define HAVE_GETHRTIME 0 +%define 
HAVE_GETOPT 1 +%define HAVE_GETPROCESSAFFINITYMASK 0 +%define HAVE_GETPROCESSMEMORYINFO 0 +%define HAVE_GETPROCESSTIMES 0 +%define HAVE_GETRUSAGE 1 +%define HAVE_GETSERVBYPORT 1 +%define HAVE_GETSYSTEMTIMEASFILETIME 0 +%define HAVE_GETTIMEOFDAY 1 +%define HAVE_INET_ATON 1 +%define HAVE_ISATTY 1 +%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0 +%define HAVE_LOCALTIME_R 1 +%define HAVE_MACH_ABSOLUTE_TIME 0 +%define HAVE_MAPVIEWOFFILE 0 +%define HAVE_MEMALIGN 1 +%define HAVE_MKSTEMP 1 +%define HAVE_MMAP 1 +%define HAVE_MPROTECT 1 +%define HAVE_NANOSLEEP 1 +%define HAVE_POSIX_MEMALIGN 1 +%define HAVE_SCHED_GETAFFINITY 1 +%define HAVE_SETCONSOLETEXTATTRIBUTE 0 +%define HAVE_SETMODE 0 +%define HAVE_SETRLIMIT 1 +%define HAVE_SLEEP 0 +%define HAVE_STRERROR_R 1 +%define HAVE_STRPTIME 1 +%define HAVE_SYSCONF 1 +%define HAVE_SYSCTL 1 +%define HAVE_USLEEP 1 +%define HAVE_VIRTUALALLOC 0 +%define HAVE_PTHREADS 1 +%define HAVE_W32THREADS 0 +%define HAVE_AS_DN_DIRECTIVE 0 +%define HAVE_AS_FUNC 1 +%define HAVE_ASM_MOD_Q 0 +%define HAVE_ATTRIBUTE_MAY_ALIAS 1 +%define HAVE_ATTRIBUTE_PACKED 1 +%define HAVE_EBP_AVAILABLE 0 +%define HAVE_EBX_AVAILABLE 1 +%define HAVE_GNU_AS 1 +%define HAVE_IBM_ASM 0 +%define HAVE_INLINE_ASM_LABELS 1 +%define HAVE_PRAGMA_DEPRECATED 1 +%define HAVE_SYMVER_ASM_LABEL 0 +%define HAVE_SYMVER_GNU_ASM 1 +%define HAVE_VFP_ARGS 0 +%define HAVE_XFORM_ASM 0 +%define HAVE_XMM_CLOBBERS 1 +%define HAVE_SOCKLEN_T 1 +%define HAVE_STRUCT_ADDRINFO 1 +%define HAVE_STRUCT_GROUP_SOURCE_REQ 1 +%define HAVE_STRUCT_IP_MREQ_SOURCE 1 +%define HAVE_STRUCT_IPV6_MREQ 1 +%define HAVE_STRUCT_POLLFD 1 +%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1 +%define HAVE_STRUCT_SOCKADDR_IN6 1 +%define HAVE_STRUCT_SOCKADDR_SA_LEN 0 +%define HAVE_STRUCT_SOCKADDR_STORAGE 1 +%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1 +%define HAVE_ATOMICS_NATIVE 1 +%define HAVE_DOS_PATHS 0 +%define HAVE_DXVA2_LIB 0 +%define HAVE_LIBC_MSVCRT 0 +%define HAVE_LIBDC1394_1 0 +%define HAVE_LIBDC1394_2 0 +%define HAVE_SDL 0 
+%define HAVE_THREADS 1 +%define HAVE_VDPAU_X11 0 +%define HAVE_XLIB 1 +%define CONFIG_BSFS 1 +%define CONFIG_DECODERS 1 +%define CONFIG_DEMUXERS 1 +%define CONFIG_ENCODERS 1 +%define CONFIG_FILTERS 1 +%define CONFIG_HWACCELS 0 +%define CONFIG_INDEVS 1 +%define CONFIG_MUXERS 1 +%define CONFIG_OUTDEVS 1 +%define CONFIG_PARSERS 1 +%define CONFIG_PROTOCOLS 1 +%define CONFIG_AVCODEC_EXAMPLE 1 +%define CONFIG_FILTER_AUDIO_EXAMPLE 1 +%define CONFIG_METADATA_EXAMPLE 1 +%define CONFIG_OUTPUT_EXAMPLE 1 +%define CONFIG_TRANSCODE_AAC_EXAMPLE 1 +%define CONFIG_AVISYNTH 0 +%define CONFIG_BZLIB 1 +%define CONFIG_FREI0R 0 +%define CONFIG_GNUTLS 0 +%define CONFIG_LIBBS2B 0 +%define CONFIG_LIBCDIO 0 +%define CONFIG_LIBDC1394 0 +%define CONFIG_LIBFAAC 0 +%define CONFIG_LIBFDK_AAC 0 +%define CONFIG_LIBFONTCONFIG 0 +%define CONFIG_LIBFREETYPE 0 +%define CONFIG_LIBGSM 0 +%define CONFIG_LIBILBC 0 +%define CONFIG_LIBMP3LAME 0 +%define CONFIG_LIBOPENCORE_AMRNB 0 +%define CONFIG_LIBOPENCORE_AMRWB 0 +%define CONFIG_LIBOPENCV 0 +%define CONFIG_LIBOPENJPEG 0 +%define CONFIG_LIBOPUS 0 +%define CONFIG_LIBPULSE 0 +%define CONFIG_LIBRTMP 0 +%define CONFIG_LIBSCHROEDINGER 0 +%define CONFIG_LIBSPEEX 0 +%define CONFIG_LIBTHEORA 0 +%define CONFIG_LIBTWOLAME 0 +%define CONFIG_LIBVO_AACENC 0 +%define CONFIG_LIBVO_AMRWBENC 0 +%define CONFIG_LIBVORBIS 0 +%define CONFIG_LIBVPX 0 +%define CONFIG_LIBWAVPACK 0 +%define CONFIG_LIBWEBP 0 +%define CONFIG_LIBX264 0 +%define CONFIG_LIBX265 0 +%define CONFIG_LIBXAVS 0 +%define CONFIG_LIBXVID 0 +%define CONFIG_OPENSSL 0 +%define CONFIG_X11GRAB 0 +%define CONFIG_ZLIB 1 +%define CONFIG_GRAY 0 +%define CONFIG_HARDCODED_TABLES 0 +%define CONFIG_RUNTIME_CPUDETECT 0 +%define CONFIG_SAFE_BITSTREAM_READER 1 +%define CONFIG_SHARED 0 +%define CONFIG_SMALL 0 +%define CONFIG_SRAM 0 +%define CONFIG_STATIC 1 +%define CONFIG_SWSCALE_ALPHA 1 +%define CONFIG_DXVA2 0 +%define CONFIG_VAAPI 0 +%define CONFIG_VDA 0 +%define CONFIG_VDPAU 0 +%define CONFIG_GPL 0 +%define CONFIG_NONFREE 0 
+%define CONFIG_VERSION3 0 +%define CONFIG_AVCODEC 1 +%define CONFIG_AVDEVICE 1 +%define CONFIG_AVFILTER 1 +%define CONFIG_AVFORMAT 1 +%define CONFIG_AVRESAMPLE 1 +%define CONFIG_AVUTIL 1 +%define CONFIG_SWSCALE 1 +%define CONFIG_AVCONV 1 +%define CONFIG_AVPLAY 0 +%define CONFIG_AVPROBE 1 +%define CONFIG_DCT 1 +%define CONFIG_DOC 1 +%define CONFIG_ERROR_RESILIENCE 1 +%define CONFIG_FFT 1 +%define CONFIG_LSP 1 +%define CONFIG_LZO 1 +%define CONFIG_MDCT 1 +%define CONFIG_NETWORK 1 +%define CONFIG_RDFT 1 +%define CONFIG_MEMALIGN_HACK 0 +%define CONFIG_NEON_CLOBBER_TEST 0 +%define CONFIG_PIC 0 +%define CONFIG_POD2MAN 1 +%define CONFIG_TEXI2HTML 0 +%define CONFIG_THUMB 0 +%define CONFIG_XMM_CLOBBER_TEST 0 +%define CONFIG_AANDCTTABLES 1 +%define CONFIG_AC3DSP 1 +%define CONFIG_AUDIO_FRAME_QUEUE 1 +%define CONFIG_AUDIODSP 1 +%define CONFIG_BLOCKDSP 1 +%define CONFIG_BSWAPDSP 1 +%define CONFIG_CABAC 1 +%define CONFIG_DVPROFILE 1 +%define CONFIG_FDCTDSP 1 +%define CONFIG_GCRYPT 0 +%define CONFIG_GOLOMB 1 +%define CONFIG_GPLV3 0 +%define CONFIG_H263DSP 1 +%define CONFIG_H264CHROMA 1 +%define CONFIG_H264DSP 1 +%define CONFIG_H264PRED 1 +%define CONFIG_H264QPEL 1 +%define CONFIG_HPELDSP 1 +%define CONFIG_HUFFMAN 1 +%define CONFIG_HUFFYUVDSP 1 +%define CONFIG_HUFFYUVENCDSP 1 +%define CONFIG_IDCTDSP 1 +%define CONFIG_IIRFILTER 1 +%define CONFIG_INTRAX8 1 +%define CONFIG_LGPLV3 0 +%define CONFIG_LPC 1 +%define CONFIG_ME_CMP 1 +%define CONFIG_MPEG_ER 1 +%define CONFIG_MPEGAUDIO 1 +%define CONFIG_MPEGAUDIODSP 1 +%define CONFIG_MPEGVIDEO 1 +%define CONFIG_MPEGVIDEOENC 1 +%define CONFIG_NETTLE 0 +%define CONFIG_PIXBLOCKDSP 1 +%define CONFIG_QPELDSP 1 +%define CONFIG_RANGECODER 1 +%define CONFIG_RIFFDEC 1 +%define CONFIG_RIFFENC 1 +%define CONFIG_RTPDEC 1 +%define CONFIG_RTPENC_CHAIN 1 +%define CONFIG_SINEWIN 1 +%define CONFIG_TPELDSP 1 +%define CONFIG_VIDEODSP 1 +%define CONFIG_VP3DSP 1 +%define CONFIG_AAC_ADTSTOASC_BSF 1 +%define CONFIG_CHOMP_BSF 1 +%define CONFIG_DUMP_EXTRADATA_BSF 
1 +%define CONFIG_H264_MP4TOANNEXB_BSF 1 +%define CONFIG_IMX_DUMP_HEADER_BSF 1 +%define CONFIG_MJPEG2JPEG_BSF 1 +%define CONFIG_MJPEGA_DUMP_HEADER_BSF 1 +%define CONFIG_MOV2TEXTSUB_BSF 1 +%define CONFIG_NOISE_BSF 1 +%define CONFIG_REMOVE_EXTRADATA_BSF 1 +%define CONFIG_TEXT2MOVSUB_BSF 1 +%define CONFIG_AASC_DECODER 1 +%define CONFIG_AIC_DECODER 1 +%define CONFIG_ALIAS_PIX_DECODER 1 +%define CONFIG_AMV_DECODER 1 +%define CONFIG_ANM_DECODER 1 +%define CONFIG_ANSI_DECODER 1 +%define CONFIG_ASV1_DECODER 1 +%define CONFIG_ASV2_DECODER 1 +%define CONFIG_AURA_DECODER 1 +%define CONFIG_AURA2_DECODER 1 +%define CONFIG_AVS_DECODER 1 +%define CONFIG_BETHSOFTVID_DECODER 1 +%define CONFIG_BFI_DECODER 1 +%define CONFIG_BINK_DECODER 1 +%define CONFIG_BMP_DECODER 1 +%define CONFIG_BMV_VIDEO_DECODER 1 +%define CONFIG_BRENDER_PIX_DECODER 1 +%define CONFIG_C93_DECODER 1 +%define CONFIG_CAVS_DECODER 1 +%define CONFIG_CDGRAPHICS_DECODER 1 +%define CONFIG_CDXL_DECODER 1 +%define CONFIG_CINEPAK_DECODER 1 +%define CONFIG_CLJR_DECODER 1 +%define CONFIG_CLLC_DECODER 1 +%define CONFIG_COMFORTNOISE_DECODER 1 +%define CONFIG_CSCD_DECODER 1 +%define CONFIG_CYUV_DECODER 1 +%define CONFIG_DFA_DECODER 1 +%define CONFIG_DNXHD_DECODER 1 +%define CONFIG_DPX_DECODER 1 +%define CONFIG_DSICINVIDEO_DECODER 1 +%define CONFIG_DVVIDEO_DECODER 1 +%define CONFIG_DXA_DECODER 1 +%define CONFIG_DXTORY_DECODER 1 +%define CONFIG_EACMV_DECODER 1 +%define CONFIG_EAMAD_DECODER 1 +%define CONFIG_EATGQ_DECODER 1 +%define CONFIG_EATGV_DECODER 1 +%define CONFIG_EATQI_DECODER 1 +%define CONFIG_EIGHTBPS_DECODER 1 +%define CONFIG_EIGHTSVX_EXP_DECODER 1 +%define CONFIG_EIGHTSVX_FIB_DECODER 1 +%define CONFIG_ESCAPE124_DECODER 1 +%define CONFIG_ESCAPE130_DECODER 1 +%define CONFIG_EXR_DECODER 1 +%define CONFIG_FFV1_DECODER 1 +%define CONFIG_FFVHUFF_DECODER 1 +%define CONFIG_FIC_DECODER 1 +%define CONFIG_FLASHSV_DECODER 1 +%define CONFIG_FLASHSV2_DECODER 1 +%define CONFIG_FLIC_DECODER 1 +%define CONFIG_FLV_DECODER 1 +%define 
CONFIG_FOURXM_DECODER 1 +%define CONFIG_FRAPS_DECODER 1 +%define CONFIG_FRWU_DECODER 1 +%define CONFIG_G2M_DECODER 1 +%define CONFIG_GIF_DECODER 1 +%define CONFIG_H261_DECODER 1 +%define CONFIG_H263_DECODER 1 +%define CONFIG_H263I_DECODER 1 +%define CONFIG_H264_DECODER 1 +%define CONFIG_HEVC_DECODER 1 +%define CONFIG_HNM4_VIDEO_DECODER 1 +%define CONFIG_HUFFYUV_DECODER 1 +%define CONFIG_IDCIN_DECODER 1 +%define CONFIG_IFF_BYTERUN1_DECODER 1 +%define CONFIG_IFF_ILBM_DECODER 1 +%define CONFIG_INDEO2_DECODER 1 +%define CONFIG_INDEO3_DECODER 1 +%define CONFIG_INDEO4_DECODER 1 +%define CONFIG_INDEO5_DECODER 1 +%define CONFIG_INTERPLAY_VIDEO_DECODER 1 +%define CONFIG_JPEG2000_DECODER 1 +%define CONFIG_JPEGLS_DECODER 1 +%define CONFIG_JV_DECODER 1 +%define CONFIG_KGV1_DECODER 1 +%define CONFIG_KMVC_DECODER 1 +%define CONFIG_LAGARITH_DECODER 1 +%define CONFIG_LOCO_DECODER 1 +%define CONFIG_MDEC_DECODER 1 +%define CONFIG_MIMIC_DECODER 1 +%define CONFIG_MJPEG_DECODER 1 +%define CONFIG_MJPEGB_DECODER 1 +%define CONFIG_MMVIDEO_DECODER 1 +%define CONFIG_MOTIONPIXELS_DECODER 1 +%define CONFIG_MPEG_XVMC_DECODER 0 +%define CONFIG_MPEG1VIDEO_DECODER 1 +%define CONFIG_MPEG2VIDEO_DECODER 1 +%define CONFIG_MPEG4_DECODER 1 +%define CONFIG_MSA1_DECODER 1 +%define CONFIG_MSMPEG4V1_DECODER 1 +%define CONFIG_MSMPEG4V2_DECODER 1 +%define CONFIG_MSMPEG4V3_DECODER 1 +%define CONFIG_MSRLE_DECODER 1 +%define CONFIG_MSS1_DECODER 1 +%define CONFIG_MSS2_DECODER 1 +%define CONFIG_MSVIDEO1_DECODER 1 +%define CONFIG_MSZH_DECODER 1 +%define CONFIG_MTS2_DECODER 1 +%define CONFIG_MVC1_DECODER 1 +%define CONFIG_MVC2_DECODER 1 +%define CONFIG_MXPEG_DECODER 1 +%define CONFIG_NUV_DECODER 1 +%define CONFIG_PAF_VIDEO_DECODER 1 +%define CONFIG_PAM_DECODER 1 +%define CONFIG_PBM_DECODER 1 +%define CONFIG_PCX_DECODER 1 +%define CONFIG_PGM_DECODER 1 +%define CONFIG_PGMYUV_DECODER 1 +%define CONFIG_PICTOR_DECODER 1 +%define CONFIG_PNG_DECODER 1 +%define CONFIG_PPM_DECODER 1 +%define CONFIG_PRORES_DECODER 1 +%define 
CONFIG_PTX_DECODER 1 +%define CONFIG_QDRAW_DECODER 1 +%define CONFIG_QPEG_DECODER 1 +%define CONFIG_QTRLE_DECODER 1 +%define CONFIG_R10K_DECODER 1 +%define CONFIG_R210_DECODER 1 +%define CONFIG_RAWVIDEO_DECODER 1 +%define CONFIG_RL2_DECODER 1 +%define CONFIG_ROQ_DECODER 1 +%define CONFIG_RPZA_DECODER 1 +%define CONFIG_RV10_DECODER 1 +%define CONFIG_RV20_DECODER 1 +%define CONFIG_RV30_DECODER 1 +%define CONFIG_RV40_DECODER 1 +%define CONFIG_S302M_DECODER 1 +%define CONFIG_SANM_DECODER 1 +%define CONFIG_SGI_DECODER 1 +%define CONFIG_SGIRLE_DECODER 1 +%define CONFIG_SMACKER_DECODER 1 +%define CONFIG_SMC_DECODER 1 +%define CONFIG_SP5X_DECODER 1 +%define CONFIG_SUNRAST_DECODER 1 +%define CONFIG_SVQ1_DECODER 1 +%define CONFIG_SVQ3_DECODER 1 +%define CONFIG_TARGA_DECODER 1 +%define CONFIG_THEORA_DECODER 1 +%define CONFIG_THP_DECODER 1 +%define CONFIG_TIERTEXSEQVIDEO_DECODER 1 +%define CONFIG_TIFF_DECODER 1 +%define CONFIG_TMV_DECODER 1 +%define CONFIG_TRUEMOTION1_DECODER 1 +%define CONFIG_TRUEMOTION2_DECODER 1 +%define CONFIG_TSCC_DECODER 1 +%define CONFIG_TSCC2_DECODER 1 +%define CONFIG_TXD_DECODER 1 +%define CONFIG_ULTI_DECODER 1 +%define CONFIG_UTVIDEO_DECODER 1 +%define CONFIG_V210_DECODER 1 +%define CONFIG_V210X_DECODER 1 +%define CONFIG_V410_DECODER 1 +%define CONFIG_VB_DECODER 1 +%define CONFIG_VBLE_DECODER 1 +%define CONFIG_VC1_DECODER 1 +%define CONFIG_VC1IMAGE_DECODER 1 +%define CONFIG_VCR1_DECODER 1 +%define CONFIG_VMDVIDEO_DECODER 1 +%define CONFIG_VMNC_DECODER 1 +%define CONFIG_VP3_DECODER 1 +%define CONFIG_VP5_DECODER 1 +%define CONFIG_VP6_DECODER 1 +%define CONFIG_VP6A_DECODER 1 +%define CONFIG_VP6F_DECODER 1 +%define CONFIG_VP7_DECODER 1 +%define CONFIG_VP8_DECODER 1 +%define CONFIG_VP9_DECODER 1 +%define CONFIG_VQA_DECODER 1 +%define CONFIG_WEBP_DECODER 1 +%define CONFIG_WMV1_DECODER 1 +%define CONFIG_WMV2_DECODER 1 +%define CONFIG_WMV3_DECODER 1 +%define CONFIG_WMV3IMAGE_DECODER 1 +%define CONFIG_WNV1_DECODER 1 +%define CONFIG_XAN_WC3_DECODER 1 +%define 
CONFIG_XAN_WC4_DECODER 1 +%define CONFIG_XBM_DECODER 1 +%define CONFIG_XL_DECODER 1 +%define CONFIG_XWD_DECODER 1 +%define CONFIG_YOP_DECODER 1 +%define CONFIG_ZEROCODEC_DECODER 1 +%define CONFIG_ZLIB_DECODER 1 +%define CONFIG_ZMBV_DECODER 1 +%define CONFIG_AAC_DECODER 1 +%define CONFIG_AAC_LATM_DECODER 1 +%define CONFIG_AC3_DECODER 1 +%define CONFIG_ALAC_DECODER 1 +%define CONFIG_ALS_DECODER 1 +%define CONFIG_AMRNB_DECODER 1 +%define CONFIG_AMRWB_DECODER 1 +%define CONFIG_APE_DECODER 1 +%define CONFIG_ATRAC1_DECODER 1 +%define CONFIG_ATRAC3_DECODER 1 +%define CONFIG_ATRAC3P_DECODER 1 +%define CONFIG_BINKAUDIO_DCT_DECODER 1 +%define CONFIG_BINKAUDIO_RDFT_DECODER 1 +%define CONFIG_BMV_AUDIO_DECODER 1 +%define CONFIG_COOK_DECODER 1 +%define CONFIG_DCA_DECODER 1 +%define CONFIG_DSICINAUDIO_DECODER 1 +%define CONFIG_EAC3_DECODER 1 +%define CONFIG_FLAC_DECODER 1 +%define CONFIG_G723_1_DECODER 1 +%define CONFIG_GSM_DECODER 1 +%define CONFIG_GSM_MS_DECODER 1 +%define CONFIG_IAC_DECODER 1 +%define CONFIG_IMC_DECODER 1 +%define CONFIG_MACE3_DECODER 1 +%define CONFIG_MACE6_DECODER 1 +%define CONFIG_METASOUND_DECODER 1 +%define CONFIG_MLP_DECODER 1 +%define CONFIG_MP1_DECODER 1 +%define CONFIG_MP1FLOAT_DECODER 1 +%define CONFIG_MP2_DECODER 1 +%define CONFIG_MP2FLOAT_DECODER 1 +%define CONFIG_MP3_DECODER 1 +%define CONFIG_MP3FLOAT_DECODER 1 +%define CONFIG_MP3ADU_DECODER 1 +%define CONFIG_MP3ADUFLOAT_DECODER 1 +%define CONFIG_MP3ON4_DECODER 1 +%define CONFIG_MP3ON4FLOAT_DECODER 1 +%define CONFIG_MPC7_DECODER 1 +%define CONFIG_MPC8_DECODER 1 +%define CONFIG_NELLYMOSER_DECODER 1 +%define CONFIG_ON2AVC_DECODER 1 +%define CONFIG_OPUS_DECODER 1 +%define CONFIG_PAF_AUDIO_DECODER 1 +%define CONFIG_QCELP_DECODER 1 +%define CONFIG_QDM2_DECODER 1 +%define CONFIG_RA_144_DECODER 1 +%define CONFIG_RA_288_DECODER 1 +%define CONFIG_RALF_DECODER 1 +%define CONFIG_SHORTEN_DECODER 1 +%define CONFIG_SIPR_DECODER 1 +%define CONFIG_SMACKAUD_DECODER 1 +%define CONFIG_TAK_DECODER 1 +%define 
CONFIG_TRUEHD_DECODER 1 +%define CONFIG_TRUESPEECH_DECODER 1 +%define CONFIG_TTA_DECODER 1 +%define CONFIG_TWINVQ_DECODER 1 +%define CONFIG_VMDAUDIO_DECODER 1 +%define CONFIG_VORBIS_DECODER 1 +%define CONFIG_WAVPACK_DECODER 1 +%define CONFIG_WMALOSSLESS_DECODER 1 +%define CONFIG_WMAPRO_DECODER 1 +%define CONFIG_WMAV1_DECODER 1 +%define CONFIG_WMAV2_DECODER 1 +%define CONFIG_WMAVOICE_DECODER 1 +%define CONFIG_WS_SND1_DECODER 1 +%define CONFIG_PCM_ALAW_DECODER 1 +%define CONFIG_PCM_BLURAY_DECODER 1 +%define CONFIG_PCM_DVD_DECODER 1 +%define CONFIG_PCM_F32BE_DECODER 1 +%define CONFIG_PCM_F32LE_DECODER 1 +%define CONFIG_PCM_F64BE_DECODER 1 +%define CONFIG_PCM_F64LE_DECODER 1 +%define CONFIG_PCM_LXF_DECODER 1 +%define CONFIG_PCM_MULAW_DECODER 1 +%define CONFIG_PCM_S8_DECODER 1 +%define CONFIG_PCM_S8_PLANAR_DECODER 1 +%define CONFIG_PCM_S16BE_DECODER 1 +%define CONFIG_PCM_S16LE_DECODER 1 +%define CONFIG_PCM_S16LE_PLANAR_DECODER 1 +%define CONFIG_PCM_S24BE_DECODER 1 +%define CONFIG_PCM_S24DAUD_DECODER 1 +%define CONFIG_PCM_S24LE_DECODER 1 +%define CONFIG_PCM_S24LE_PLANAR_DECODER 1 +%define CONFIG_PCM_S32BE_DECODER 1 +%define CONFIG_PCM_S32LE_DECODER 1 +%define CONFIG_PCM_S32LE_PLANAR_DECODER 1 +%define CONFIG_PCM_U8_DECODER 1 +%define CONFIG_PCM_U16BE_DECODER 1 +%define CONFIG_PCM_U16LE_DECODER 1 +%define CONFIG_PCM_U24BE_DECODER 1 +%define CONFIG_PCM_U24LE_DECODER 1 +%define CONFIG_PCM_U32BE_DECODER 1 +%define CONFIG_PCM_U32LE_DECODER 1 +%define CONFIG_PCM_ZORK_DECODER 1 +%define CONFIG_INTERPLAY_DPCM_DECODER 1 +%define CONFIG_ROQ_DPCM_DECODER 1 +%define CONFIG_SOL_DPCM_DECODER 1 +%define CONFIG_XAN_DPCM_DECODER 1 +%define CONFIG_ADPCM_4XM_DECODER 1 +%define CONFIG_ADPCM_ADX_DECODER 1 +%define CONFIG_ADPCM_CT_DECODER 1 +%define CONFIG_ADPCM_EA_DECODER 1 +%define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1 +%define CONFIG_ADPCM_EA_R1_DECODER 1 +%define CONFIG_ADPCM_EA_R2_DECODER 1 +%define CONFIG_ADPCM_EA_R3_DECODER 1 +%define CONFIG_ADPCM_EA_XAS_DECODER 1 +%define 
CONFIG_ADPCM_G722_DECODER 1 +%define CONFIG_ADPCM_G726_DECODER 1 +%define CONFIG_ADPCM_IMA_AMV_DECODER 1 +%define CONFIG_ADPCM_IMA_APC_DECODER 1 +%define CONFIG_ADPCM_IMA_DK3_DECODER 1 +%define CONFIG_ADPCM_IMA_DK4_DECODER 1 +%define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1 +%define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1 +%define CONFIG_ADPCM_IMA_ISS_DECODER 1 +%define CONFIG_ADPCM_IMA_QT_DECODER 1 +%define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1 +%define CONFIG_ADPCM_IMA_WAV_DECODER 1 +%define CONFIG_ADPCM_IMA_WS_DECODER 1 +%define CONFIG_ADPCM_MS_DECODER 1 +%define CONFIG_ADPCM_SBPRO_2_DECODER 1 +%define CONFIG_ADPCM_SBPRO_3_DECODER 1 +%define CONFIG_ADPCM_SBPRO_4_DECODER 1 +%define CONFIG_ADPCM_SWF_DECODER 1 +%define CONFIG_ADPCM_THP_DECODER 1 +%define CONFIG_ADPCM_VIMA_DECODER 1 +%define CONFIG_ADPCM_XA_DECODER 1 +%define CONFIG_ADPCM_YAMAHA_DECODER 1 +%define CONFIG_ASS_DECODER 1 +%define CONFIG_DVBSUB_DECODER 1 +%define CONFIG_DVDSUB_DECODER 1 +%define CONFIG_PGSSUB_DECODER 1 +%define CONFIG_SRT_DECODER 1 +%define CONFIG_XSUB_DECODER 1 +%define CONFIG_LIBFDK_AAC_DECODER 0 +%define CONFIG_LIBGSM_DECODER 0 +%define CONFIG_LIBGSM_MS_DECODER 0 +%define CONFIG_LIBILBC_DECODER 0 +%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0 +%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0 +%define CONFIG_LIBOPENJPEG_DECODER 0 +%define CONFIG_LIBOPUS_DECODER 0 +%define CONFIG_LIBSCHROEDINGER_DECODER 0 +%define CONFIG_LIBSPEEX_DECODER 0 +%define CONFIG_LIBVPX_VP8_DECODER 0 +%define CONFIG_LIBVPX_VP9_DECODER 0 +%define CONFIG_AAC_DEMUXER 1 +%define CONFIG_AC3_DEMUXER 1 +%define CONFIG_ADX_DEMUXER 1 +%define CONFIG_AEA_DEMUXER 1 +%define CONFIG_AIFF_DEMUXER 1 +%define CONFIG_AMR_DEMUXER 1 +%define CONFIG_ANM_DEMUXER 1 +%define CONFIG_APC_DEMUXER 1 +%define CONFIG_APE_DEMUXER 1 +%define CONFIG_ASF_DEMUXER 1 +%define CONFIG_ASS_DEMUXER 1 +%define CONFIG_AU_DEMUXER 1 +%define CONFIG_AVI_DEMUXER 1 +%define CONFIG_AVISYNTH_DEMUXER 0 +%define CONFIG_AVS_DEMUXER 1 +%define CONFIG_BETHSOFTVID_DEMUXER 1 +%define 
CONFIG_BFI_DEMUXER 1 +%define CONFIG_BINK_DEMUXER 1 +%define CONFIG_BMV_DEMUXER 1 +%define CONFIG_C93_DEMUXER 1 +%define CONFIG_CAF_DEMUXER 1 +%define CONFIG_CAVSVIDEO_DEMUXER 1 +%define CONFIG_CDG_DEMUXER 1 +%define CONFIG_CDXL_DEMUXER 1 +%define CONFIG_DAUD_DEMUXER 1 +%define CONFIG_DFA_DEMUXER 1 +%define CONFIG_DIRAC_DEMUXER 1 +%define CONFIG_DNXHD_DEMUXER 1 +%define CONFIG_DSICIN_DEMUXER 1 +%define CONFIG_DTS_DEMUXER 1 +%define CONFIG_DV_DEMUXER 1 +%define CONFIG_DXA_DEMUXER 1 +%define CONFIG_EA_DEMUXER 1 +%define CONFIG_EA_CDATA_DEMUXER 1 +%define CONFIG_EAC3_DEMUXER 1 +%define CONFIG_FFMETADATA_DEMUXER 1 +%define CONFIG_FILMSTRIP_DEMUXER 1 +%define CONFIG_FLAC_DEMUXER 1 +%define CONFIG_FLIC_DEMUXER 1 +%define CONFIG_FLV_DEMUXER 1 +%define CONFIG_FOURXM_DEMUXER 1 +%define CONFIG_G722_DEMUXER 1 +%define CONFIG_G723_1_DEMUXER 1 +%define CONFIG_GSM_DEMUXER 1 +%define CONFIG_GXF_DEMUXER 1 +%define CONFIG_H261_DEMUXER 1 +%define CONFIG_H263_DEMUXER 1 +%define CONFIG_H264_DEMUXER 1 +%define CONFIG_HEVC_DEMUXER 1 +%define CONFIG_HLS_DEMUXER 1 +%define CONFIG_HNM_DEMUXER 1 +%define CONFIG_IDCIN_DEMUXER 1 +%define CONFIG_IFF_DEMUXER 1 +%define CONFIG_ILBC_DEMUXER 1 +%define CONFIG_IMAGE2_DEMUXER 1 +%define CONFIG_IMAGE2PIPE_DEMUXER 1 +%define CONFIG_INGENIENT_DEMUXER 1 +%define CONFIG_IPMOVIE_DEMUXER 1 +%define CONFIG_ISS_DEMUXER 1 +%define CONFIG_IV8_DEMUXER 1 +%define CONFIG_IVF_DEMUXER 1 +%define CONFIG_JV_DEMUXER 1 +%define CONFIG_LATM_DEMUXER 1 +%define CONFIG_LMLM4_DEMUXER 1 +%define CONFIG_LXF_DEMUXER 1 +%define CONFIG_M4V_DEMUXER 1 +%define CONFIG_MATROSKA_DEMUXER 1 +%define CONFIG_MJPEG_DEMUXER 1 +%define CONFIG_MLP_DEMUXER 1 +%define CONFIG_MM_DEMUXER 1 +%define CONFIG_MMF_DEMUXER 1 +%define CONFIG_MOV_DEMUXER 1 +%define CONFIG_MP3_DEMUXER 1 +%define CONFIG_MPC_DEMUXER 1 +%define CONFIG_MPC8_DEMUXER 1 +%define CONFIG_MPEGPS_DEMUXER 1 +%define CONFIG_MPEGTS_DEMUXER 1 +%define CONFIG_MPEGTSRAW_DEMUXER 1 +%define CONFIG_MPEGVIDEO_DEMUXER 1 +%define 
CONFIG_MSNWC_TCP_DEMUXER 1 +%define CONFIG_MTV_DEMUXER 1 +%define CONFIG_MV_DEMUXER 1 +%define CONFIG_MVI_DEMUXER 1 +%define CONFIG_MXF_DEMUXER 1 +%define CONFIG_MXG_DEMUXER 1 +%define CONFIG_NC_DEMUXER 1 +%define CONFIG_NSV_DEMUXER 1 +%define CONFIG_NUT_DEMUXER 1 +%define CONFIG_NUV_DEMUXER 1 +%define CONFIG_OGG_DEMUXER 1 +%define CONFIG_OMA_DEMUXER 1 +%define CONFIG_PAF_DEMUXER 1 +%define CONFIG_PCM_ALAW_DEMUXER 1 +%define CONFIG_PCM_MULAW_DEMUXER 1 +%define CONFIG_PCM_F64BE_DEMUXER 1 +%define CONFIG_PCM_F64LE_DEMUXER 1 +%define CONFIG_PCM_F32BE_DEMUXER 1 +%define CONFIG_PCM_F32LE_DEMUXER 1 +%define CONFIG_PCM_S32BE_DEMUXER 1 +%define CONFIG_PCM_S32LE_DEMUXER 1 +%define CONFIG_PCM_S24BE_DEMUXER 1 +%define CONFIG_PCM_S24LE_DEMUXER 1 +%define CONFIG_PCM_S16BE_DEMUXER 1 +%define CONFIG_PCM_S16LE_DEMUXER 1 +%define CONFIG_PCM_S8_DEMUXER 1 +%define CONFIG_PCM_U32BE_DEMUXER 1 +%define CONFIG_PCM_U32LE_DEMUXER 1 +%define CONFIG_PCM_U24BE_DEMUXER 1 +%define CONFIG_PCM_U24LE_DEMUXER 1 +%define CONFIG_PCM_U16BE_DEMUXER 1 +%define CONFIG_PCM_U16LE_DEMUXER 1 +%define CONFIG_PCM_U8_DEMUXER 1 +%define CONFIG_PMP_DEMUXER 1 +%define CONFIG_PVA_DEMUXER 1 +%define CONFIG_QCP_DEMUXER 1 +%define CONFIG_R3D_DEMUXER 1 +%define CONFIG_RAWVIDEO_DEMUXER 1 +%define CONFIG_RL2_DEMUXER 1 +%define CONFIG_RM_DEMUXER 1 +%define CONFIG_ROQ_DEMUXER 1 +%define CONFIG_RPL_DEMUXER 1 +%define CONFIG_RSO_DEMUXER 1 +%define CONFIG_RTP_DEMUXER 1 +%define CONFIG_RTSP_DEMUXER 1 +%define CONFIG_SAP_DEMUXER 1 +%define CONFIG_SDP_DEMUXER 1 +%define CONFIG_SEGAFILM_DEMUXER 1 +%define CONFIG_SHORTEN_DEMUXER 1 +%define CONFIG_SIFF_DEMUXER 1 +%define CONFIG_SMACKER_DEMUXER 1 +%define CONFIG_SMJPEG_DEMUXER 1 +%define CONFIG_SMUSH_DEMUXER 1 +%define CONFIG_SOL_DEMUXER 1 +%define CONFIG_SOX_DEMUXER 1 +%define CONFIG_SPDIF_DEMUXER 1 +%define CONFIG_SRT_DEMUXER 1 +%define CONFIG_STR_DEMUXER 1 +%define CONFIG_SWF_DEMUXER 1 +%define CONFIG_TAK_DEMUXER 1 +%define CONFIG_THP_DEMUXER 1 +%define CONFIG_TIERTEXSEQ_DEMUXER 
1 +%define CONFIG_TMV_DEMUXER 1 +%define CONFIG_TRUEHD_DEMUXER 1 +%define CONFIG_TTA_DEMUXER 1 +%define CONFIG_TXD_DEMUXER 1 +%define CONFIG_TTY_DEMUXER 1 +%define CONFIG_VC1_DEMUXER 1 +%define CONFIG_VC1T_DEMUXER 1 +%define CONFIG_VMD_DEMUXER 1 +%define CONFIG_VOC_DEMUXER 1 +%define CONFIG_VQF_DEMUXER 1 +%define CONFIG_W64_DEMUXER 1 +%define CONFIG_WAV_DEMUXER 1 +%define CONFIG_WC3_DEMUXER 1 +%define CONFIG_WSAUD_DEMUXER 1 +%define CONFIG_WSVQA_DEMUXER 1 +%define CONFIG_WTV_DEMUXER 1 +%define CONFIG_WV_DEMUXER 1 +%define CONFIG_XA_DEMUXER 1 +%define CONFIG_XMV_DEMUXER 1 +%define CONFIG_XWMA_DEMUXER 1 +%define CONFIG_YOP_DEMUXER 1 +%define CONFIG_YUV4MPEGPIPE_DEMUXER 1 +%define CONFIG_A64MULTI_ENCODER 1 +%define CONFIG_A64MULTI5_ENCODER 1 +%define CONFIG_ALIAS_PIX_ENCODER 1 +%define CONFIG_ASV1_ENCODER 1 +%define CONFIG_ASV2_ENCODER 1 +%define CONFIG_BMP_ENCODER 1 +%define CONFIG_CLJR_ENCODER 1 +%define CONFIG_COMFORTNOISE_ENCODER 1 +%define CONFIG_DNXHD_ENCODER 1 +%define CONFIG_DPX_ENCODER 1 +%define CONFIG_DVVIDEO_ENCODER 1 +%define CONFIG_FFV1_ENCODER 1 +%define CONFIG_FFVHUFF_ENCODER 1 +%define CONFIG_FLASHSV_ENCODER 1 +%define CONFIG_FLV_ENCODER 1 +%define CONFIG_GIF_ENCODER 1 +%define CONFIG_H261_ENCODER 1 +%define CONFIG_H263_ENCODER 1 +%define CONFIG_H263P_ENCODER 1 +%define CONFIG_HUFFYUV_ENCODER 1 +%define CONFIG_JPEGLS_ENCODER 1 +%define CONFIG_LJPEG_ENCODER 1 +%define CONFIG_MJPEG_ENCODER 1 +%define CONFIG_MPEG1VIDEO_ENCODER 1 +%define CONFIG_MPEG2VIDEO_ENCODER 1 +%define CONFIG_MPEG4_ENCODER 1 +%define CONFIG_MSMPEG4V2_ENCODER 1 +%define CONFIG_MSMPEG4V3_ENCODER 1 +%define CONFIG_PAM_ENCODER 1 +%define CONFIG_PBM_ENCODER 1 +%define CONFIG_PCX_ENCODER 1 +%define CONFIG_PGM_ENCODER 1 +%define CONFIG_PGMYUV_ENCODER 1 +%define CONFIG_PNG_ENCODER 1 +%define CONFIG_PPM_ENCODER 1 +%define CONFIG_PRORES_ENCODER 1 +%define CONFIG_QTRLE_ENCODER 1 +%define CONFIG_RAWVIDEO_ENCODER 1 +%define CONFIG_ROQ_ENCODER 1 +%define CONFIG_RV10_ENCODER 1 +%define 
CONFIG_RV20_ENCODER 1 +%define CONFIG_SGI_ENCODER 1 +%define CONFIG_SUNRAST_ENCODER 1 +%define CONFIG_SVQ1_ENCODER 1 +%define CONFIG_TARGA_ENCODER 1 +%define CONFIG_LIBTWOLAME_ENCODER 0 +%define CONFIG_TIFF_ENCODER 1 +%define CONFIG_UTVIDEO_ENCODER 1 +%define CONFIG_V210_ENCODER 1 +%define CONFIG_V410_ENCODER 1 +%define CONFIG_WMV1_ENCODER 1 +%define CONFIG_WMV2_ENCODER 1 +%define CONFIG_XBM_ENCODER 1 +%define CONFIG_XWD_ENCODER 1 +%define CONFIG_ZLIB_ENCODER 1 +%define CONFIG_ZMBV_ENCODER 1 +%define CONFIG_AAC_ENCODER 1 +%define CONFIG_AC3_ENCODER 1 +%define CONFIG_AC3_FIXED_ENCODER 1 +%define CONFIG_ALAC_ENCODER 1 +%define CONFIG_EAC3_ENCODER 1 +%define CONFIG_FLAC_ENCODER 1 +%define CONFIG_MP2_ENCODER 1 +%define CONFIG_NELLYMOSER_ENCODER 1 +%define CONFIG_RA_144_ENCODER 1 +%define CONFIG_VORBIS_ENCODER 1 +%define CONFIG_WMAV1_ENCODER 1 +%define CONFIG_WMAV2_ENCODER 1 +%define CONFIG_PCM_ALAW_ENCODER 1 +%define CONFIG_PCM_F32BE_ENCODER 1 +%define CONFIG_PCM_F32LE_ENCODER 1 +%define CONFIG_PCM_F64BE_ENCODER 1 +%define CONFIG_PCM_F64LE_ENCODER 1 +%define CONFIG_PCM_MULAW_ENCODER 1 +%define CONFIG_PCM_S8_ENCODER 1 +%define CONFIG_PCM_S16BE_ENCODER 1 +%define CONFIG_PCM_S16LE_ENCODER 1 +%define CONFIG_PCM_S24BE_ENCODER 1 +%define CONFIG_PCM_S24DAUD_ENCODER 1 +%define CONFIG_PCM_S24LE_ENCODER 1 +%define CONFIG_PCM_S32BE_ENCODER 1 +%define CONFIG_PCM_S32LE_ENCODER 1 +%define CONFIG_PCM_U8_ENCODER 1 +%define CONFIG_PCM_U16BE_ENCODER 1 +%define CONFIG_PCM_U16LE_ENCODER 1 +%define CONFIG_PCM_U24BE_ENCODER 1 +%define CONFIG_PCM_U24LE_ENCODER 1 +%define CONFIG_PCM_U32BE_ENCODER 1 +%define CONFIG_PCM_U32LE_ENCODER 1 +%define CONFIG_ROQ_DPCM_ENCODER 1 +%define CONFIG_ADPCM_ADX_ENCODER 1 +%define CONFIG_ADPCM_G722_ENCODER 1 +%define CONFIG_ADPCM_G726_ENCODER 1 +%define CONFIG_ADPCM_IMA_QT_ENCODER 1 +%define CONFIG_ADPCM_IMA_WAV_ENCODER 1 +%define CONFIG_ADPCM_MS_ENCODER 1 +%define CONFIG_ADPCM_SWF_ENCODER 1 +%define CONFIG_ADPCM_YAMAHA_ENCODER 1 +%define CONFIG_ASS_ENCODER 1 
+%define CONFIG_DVBSUB_ENCODER 1 +%define CONFIG_DVDSUB_ENCODER 1 +%define CONFIG_XSUB_ENCODER 1 +%define CONFIG_LIBFAAC_ENCODER 0 +%define CONFIG_LIBFDK_AAC_ENCODER 0 +%define CONFIG_LIBGSM_ENCODER 0 +%define CONFIG_LIBGSM_MS_ENCODER 0 +%define CONFIG_LIBILBC_ENCODER 0 +%define CONFIG_LIBMP3LAME_ENCODER 0 +%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0 +%define CONFIG_LIBOPENJPEG_ENCODER 0 +%define CONFIG_LIBOPUS_ENCODER 0 +%define CONFIG_LIBSCHROEDINGER_ENCODER 0 +%define CONFIG_LIBSPEEX_ENCODER 0 +%define CONFIG_LIBTHEORA_ENCODER 0 +%define CONFIG_LIBVO_AACENC_ENCODER 0 +%define CONFIG_LIBVO_AMRWBENC_ENCODER 0 +%define CONFIG_LIBVORBIS_ENCODER 0 +%define CONFIG_LIBVPX_VP8_ENCODER 0 +%define CONFIG_LIBVPX_VP9_ENCODER 0 +%define CONFIG_LIBWAVPACK_ENCODER 0 +%define CONFIG_LIBWEBP_ENCODER 0 +%define CONFIG_LIBX264_ENCODER 0 +%define CONFIG_LIBX265_ENCODER 0 +%define CONFIG_LIBXAVS_ENCODER 0 +%define CONFIG_LIBXVID_ENCODER 0 +%define CONFIG_AFORMAT_FILTER 1 +%define CONFIG_AMIX_FILTER 1 +%define CONFIG_ANULL_FILTER 1 +%define CONFIG_ASETPTS_FILTER 1 +%define CONFIG_ASETTB_FILTER 1 +%define CONFIG_ASHOWINFO_FILTER 1 +%define CONFIG_ASPLIT_FILTER 1 +%define CONFIG_ASYNCTS_FILTER 1 +%define CONFIG_ATRIM_FILTER 1 +%define CONFIG_BS2B_FILTER 0 +%define CONFIG_CHANNELMAP_FILTER 1 +%define CONFIG_CHANNELSPLIT_FILTER 1 +%define CONFIG_COMPAND_FILTER 1 +%define CONFIG_JOIN_FILTER 1 +%define CONFIG_RESAMPLE_FILTER 1 +%define CONFIG_VOLUME_FILTER 1 +%define CONFIG_ANULLSRC_FILTER 1 +%define CONFIG_ANULLSINK_FILTER 1 +%define CONFIG_BLACKFRAME_FILTER 0 +%define CONFIG_BOXBLUR_FILTER 0 +%define CONFIG_COPY_FILTER 1 +%define CONFIG_CROP_FILTER 1 +%define CONFIG_CROPDETECT_FILTER 0 +%define CONFIG_DELOGO_FILTER 0 +%define CONFIG_DRAWBOX_FILTER 1 +%define CONFIG_DRAWTEXT_FILTER 0 +%define CONFIG_FADE_FILTER 1 +%define CONFIG_FIELDORDER_FILTER 1 +%define CONFIG_FORMAT_FILTER 1 +%define CONFIG_FPS_FILTER 1 +%define CONFIG_FRAMEPACK_FILTER 1 +%define CONFIG_FREI0R_FILTER 0 +%define 
CONFIG_GRADFUN_FILTER 1 +%define CONFIG_HFLIP_FILTER 1 +%define CONFIG_HQDN3D_FILTER 0 +%define CONFIG_INTERLACE_FILTER 0 +%define CONFIG_LUT_FILTER 1 +%define CONFIG_LUTRGB_FILTER 1 +%define CONFIG_LUTYUV_FILTER 1 +%define CONFIG_NEGATE_FILTER 1 +%define CONFIG_NOFORMAT_FILTER 1 +%define CONFIG_NULL_FILTER 1 +%define CONFIG_OCV_FILTER 0 +%define CONFIG_OVERLAY_FILTER 1 +%define CONFIG_PAD_FILTER 1 +%define CONFIG_PIXDESCTEST_FILTER 1 +%define CONFIG_SCALE_FILTER 1 +%define CONFIG_SELECT_FILTER 1 +%define CONFIG_SETDAR_FILTER 1 +%define CONFIG_SETPTS_FILTER 1 +%define CONFIG_SETSAR_FILTER 1 +%define CONFIG_SETTB_FILTER 1 +%define CONFIG_SHOWINFO_FILTER 1 +%define CONFIG_SHUFFLEPLANES_FILTER 1 +%define CONFIG_SPLIT_FILTER 1 +%define CONFIG_TRANSPOSE_FILTER 1 +%define CONFIG_TRIM_FILTER 1 +%define CONFIG_UNSHARP_FILTER 1 +%define CONFIG_VFLIP_FILTER 1 +%define CONFIG_YADIF_FILTER 1 +%define CONFIG_COLOR_FILTER 1 +%define CONFIG_FREI0R_SRC_FILTER 0 +%define CONFIG_MOVIE_FILTER 1 +%define CONFIG_NULLSRC_FILTER 1 +%define CONFIG_RGBTESTSRC_FILTER 1 +%define CONFIG_TESTSRC_FILTER 1 +%define CONFIG_NULLSINK_FILTER 1 +%define CONFIG_H263_VAAPI_HWACCEL 0 +%define CONFIG_H263_VDPAU_HWACCEL 0 +%define CONFIG_H264_DXVA2_HWACCEL 0 +%define CONFIG_H264_VAAPI_HWACCEL 0 +%define CONFIG_H264_VDA_HWACCEL 0 +%define CONFIG_H264_VDA_OLD_HWACCEL 0 +%define CONFIG_H264_VDPAU_HWACCEL 0 +%define CONFIG_MPEG1_VDPAU_HWACCEL 0 +%define CONFIG_MPEG2_DXVA2_HWACCEL 0 +%define CONFIG_MPEG2_VAAPI_HWACCEL 0 +%define CONFIG_MPEG2_VDPAU_HWACCEL 0 +%define CONFIG_MPEG4_VAAPI_HWACCEL 0 +%define CONFIG_MPEG4_VDPAU_HWACCEL 0 +%define CONFIG_VC1_DXVA2_HWACCEL 0 +%define CONFIG_VC1_VAAPI_HWACCEL 0 +%define CONFIG_VC1_VDPAU_HWACCEL 0 +%define CONFIG_WMV3_DXVA2_HWACCEL 0 +%define CONFIG_WMV3_VAAPI_HWACCEL 0 +%define CONFIG_WMV3_VDPAU_HWACCEL 0 +%define CONFIG_ALSA_INDEV 1 +%define CONFIG_BKTR_INDEV 0 +%define CONFIG_DV1394_INDEV 1 +%define CONFIG_FBDEV_INDEV 1 +%define CONFIG_JACK_INDEV 0 +%define 
CONFIG_OSS_INDEV 1 +%define CONFIG_PULSE_INDEV 0 +%define CONFIG_SNDIO_INDEV 0 +%define CONFIG_V4L2_INDEV 1 +%define CONFIG_VFWCAP_INDEV 0 +%define CONFIG_X11GRAB_INDEV 0 +%define CONFIG_LIBCDIO_INDEV 0 +%define CONFIG_LIBDC1394_INDEV 0 +%define CONFIG_A64_MUXER 1 +%define CONFIG_AC3_MUXER 1 +%define CONFIG_ADTS_MUXER 1 +%define CONFIG_ADX_MUXER 1 +%define CONFIG_AIFF_MUXER 1 +%define CONFIG_AMR_MUXER 1 +%define CONFIG_ASF_MUXER 1 +%define CONFIG_ASS_MUXER 1 +%define CONFIG_ASF_STREAM_MUXER 1 +%define CONFIG_AU_MUXER 1 +%define CONFIG_AVI_MUXER 1 +%define CONFIG_AVM2_MUXER 1 +%define CONFIG_CAVSVIDEO_MUXER 1 +%define CONFIG_CRC_MUXER 1 +%define CONFIG_DAUD_MUXER 1 +%define CONFIG_DIRAC_MUXER 1 +%define CONFIG_DNXHD_MUXER 1 +%define CONFIG_DTS_MUXER 1 +%define CONFIG_DV_MUXER 1 +%define CONFIG_EAC3_MUXER 1 +%define CONFIG_F4V_MUXER 1 +%define CONFIG_FFMETADATA_MUXER 1 +%define CONFIG_FILMSTRIP_MUXER 1 +%define CONFIG_FLAC_MUXER 1 +%define CONFIG_FLV_MUXER 1 +%define CONFIG_FRAMECRC_MUXER 1 +%define CONFIG_FRAMEMD5_MUXER 1 +%define CONFIG_G722_MUXER 1 +%define CONFIG_GIF_MUXER 1 +%define CONFIG_GXF_MUXER 1 +%define CONFIG_H261_MUXER 1 +%define CONFIG_H263_MUXER 1 +%define CONFIG_H264_MUXER 1 +%define CONFIG_HDS_MUXER 1 +%define CONFIG_HEVC_MUXER 1 +%define CONFIG_HLS_MUXER 1 +%define CONFIG_ILBC_MUXER 1 +%define CONFIG_IMAGE2_MUXER 1 +%define CONFIG_IMAGE2PIPE_MUXER 1 +%define CONFIG_IPOD_MUXER 1 +%define CONFIG_ISMV_MUXER 1 +%define CONFIG_IVF_MUXER 1 +%define CONFIG_LATM_MUXER 1 +%define CONFIG_M4V_MUXER 1 +%define CONFIG_MD5_MUXER 1 +%define CONFIG_MATROSKA_MUXER 1 +%define CONFIG_MATROSKA_AUDIO_MUXER 1 +%define CONFIG_MJPEG_MUXER 1 +%define CONFIG_MLP_MUXER 1 +%define CONFIG_MMF_MUXER 1 +%define CONFIG_MOV_MUXER 1 +%define CONFIG_MP2_MUXER 1 +%define CONFIG_MP3_MUXER 1 +%define CONFIG_MP4_MUXER 1 +%define CONFIG_MPEG1SYSTEM_MUXER 1 +%define CONFIG_MPEG1VCD_MUXER 1 +%define CONFIG_MPEG1VIDEO_MUXER 1 +%define CONFIG_MPEG2DVD_MUXER 1 +%define CONFIG_MPEG2SVCD_MUXER 
1 +%define CONFIG_MPEG2VIDEO_MUXER 1 +%define CONFIG_MPEG2VOB_MUXER 1 +%define CONFIG_MPEGTS_MUXER 1 +%define CONFIG_MPJPEG_MUXER 1 +%define CONFIG_MXF_MUXER 1 +%define CONFIG_MXF_D10_MUXER 1 +%define CONFIG_NULL_MUXER 1 +%define CONFIG_NUT_MUXER 1 +%define CONFIG_OGG_MUXER 1 +%define CONFIG_OMA_MUXER 1 +%define CONFIG_PCM_ALAW_MUXER 1 +%define CONFIG_PCM_MULAW_MUXER 1 +%define CONFIG_PCM_F64BE_MUXER 1 +%define CONFIG_PCM_F64LE_MUXER 1 +%define CONFIG_PCM_F32BE_MUXER 1 +%define CONFIG_PCM_F32LE_MUXER 1 +%define CONFIG_PCM_S32BE_MUXER 1 +%define CONFIG_PCM_S32LE_MUXER 1 +%define CONFIG_PCM_S24BE_MUXER 1 +%define CONFIG_PCM_S24LE_MUXER 1 +%define CONFIG_PCM_S16BE_MUXER 1 +%define CONFIG_PCM_S16LE_MUXER 1 +%define CONFIG_PCM_S8_MUXER 1 +%define CONFIG_PCM_U32BE_MUXER 1 +%define CONFIG_PCM_U32LE_MUXER 1 +%define CONFIG_PCM_U24BE_MUXER 1 +%define CONFIG_PCM_U24LE_MUXER 1 +%define CONFIG_PCM_U16BE_MUXER 1 +%define CONFIG_PCM_U16LE_MUXER 1 +%define CONFIG_PCM_U8_MUXER 1 +%define CONFIG_PSP_MUXER 1 +%define CONFIG_RAWVIDEO_MUXER 1 +%define CONFIG_RM_MUXER 1 +%define CONFIG_ROQ_MUXER 1 +%define CONFIG_RSO_MUXER 1 +%define CONFIG_RTP_MUXER 1 +%define CONFIG_RTSP_MUXER 1 +%define CONFIG_SAP_MUXER 1 +%define CONFIG_SEGMENT_MUXER 1 +%define CONFIG_SMJPEG_MUXER 1 +%define CONFIG_SMOOTHSTREAMING_MUXER 1 +%define CONFIG_SOX_MUXER 1 +%define CONFIG_SPDIF_MUXER 1 +%define CONFIG_SRT_MUXER 1 +%define CONFIG_SWF_MUXER 1 +%define CONFIG_TG2_MUXER 1 +%define CONFIG_TGP_MUXER 1 +%define CONFIG_TRUEHD_MUXER 1 +%define CONFIG_VC1T_MUXER 1 +%define CONFIG_VOC_MUXER 1 +%define CONFIG_WAV_MUXER 1 +%define CONFIG_WEBM_MUXER 1 +%define CONFIG_WV_MUXER 1 +%define CONFIG_YUV4MPEGPIPE_MUXER 1 +%define CONFIG_ALSA_OUTDEV 1 +%define CONFIG_OSS_OUTDEV 1 +%define CONFIG_SNDIO_OUTDEV 0 +%define CONFIG_AAC_PARSER 1 +%define CONFIG_AAC_LATM_PARSER 1 +%define CONFIG_AC3_PARSER 1 +%define CONFIG_ADX_PARSER 1 +%define CONFIG_BMP_PARSER 1 +%define CONFIG_CAVSVIDEO_PARSER 1 +%define CONFIG_COOK_PARSER 1 
+%define CONFIG_DCA_PARSER 1 +%define CONFIG_DIRAC_PARSER 1 +%define CONFIG_DNXHD_PARSER 1 +%define CONFIG_DVBSUB_PARSER 1 +%define CONFIG_DVDSUB_PARSER 1 +%define CONFIG_FLAC_PARSER 1 +%define CONFIG_GSM_PARSER 1 +%define CONFIG_H261_PARSER 1 +%define CONFIG_H263_PARSER 1 +%define CONFIG_H264_PARSER 1 +%define CONFIG_HEVC_PARSER 1 +%define CONFIG_MJPEG_PARSER 1 +%define CONFIG_MLP_PARSER 1 +%define CONFIG_MPEG4VIDEO_PARSER 1 +%define CONFIG_MPEGAUDIO_PARSER 1 +%define CONFIG_MPEGVIDEO_PARSER 1 +%define CONFIG_OPUS_PARSER 1 +%define CONFIG_PNG_PARSER 1 +%define CONFIG_PNM_PARSER 1 +%define CONFIG_RV30_PARSER 1 +%define CONFIG_RV40_PARSER 1 +%define CONFIG_TAK_PARSER 1 +%define CONFIG_VC1_PARSER 1 +%define CONFIG_VORBIS_PARSER 1 +%define CONFIG_VP3_PARSER 1 +%define CONFIG_VP8_PARSER 1 +%define CONFIG_CONCAT_PROTOCOL 1 +%define CONFIG_CRYPTO_PROTOCOL 1 +%define CONFIG_FFRTMPCRYPT_PROTOCOL 0 +%define CONFIG_FFRTMPHTTP_PROTOCOL 1 +%define CONFIG_FILE_PROTOCOL 1 +%define CONFIG_GOPHER_PROTOCOL 1 +%define CONFIG_HLS_PROTOCOL 1 +%define CONFIG_HTTP_PROTOCOL 1 +%define CONFIG_HTTPPROXY_PROTOCOL 1 +%define CONFIG_HTTPS_PROTOCOL 0 +%define CONFIG_MMSH_PROTOCOL 1 +%define CONFIG_MMST_PROTOCOL 1 +%define CONFIG_MD5_PROTOCOL 1 +%define CONFIG_PIPE_PROTOCOL 1 +%define CONFIG_RTMP_PROTOCOL 1 +%define CONFIG_RTMPE_PROTOCOL 0 +%define CONFIG_RTMPS_PROTOCOL 0 +%define CONFIG_RTMPT_PROTOCOL 1 +%define CONFIG_RTMPTE_PROTOCOL 0 +%define CONFIG_RTMPTS_PROTOCOL 0 +%define CONFIG_RTP_PROTOCOL 1 +%define CONFIG_SCTP_PROTOCOL 0 +%define CONFIG_SRTP_PROTOCOL 1 +%define CONFIG_TCP_PROTOCOL 1 +%define CONFIG_TLS_PROTOCOL 0 +%define CONFIG_UDP_PROTOCOL 1 +%define CONFIG_UNIX_PROTOCOL 1 +%define CONFIG_LIBRTMP_PROTOCOL 0 +%define CONFIG_LIBRTMPE_PROTOCOL 0 +%define CONFIG_LIBRTMPS_PROTOCOL 0 +%define CONFIG_LIBRTMPT_PROTOCOL 0 +%define CONFIG_LIBRTMPTE_PROTOCOL 0 diff --git a/config.h b/config.h new file mode 100644 index 0000000..ffd98d8 --- /dev/null +++ b/config.h @@ -0,0 +1,12 @@ +#define 
HAVE_AVX 1 +#define HAVE_SSE 1 +#define HAVE_SSE3_EXTERNAL 1 +#define HAVE_AVX_EXTERNAL 1 +#define HAVE_FMA3_EXTERNAL 1 +#define ARCH_X86_64 1 +#define ARCH_X86 1 +#define HAVE_SCHED_GETAFFINITY 1 +#define HAVE_SYSCTL 1 +#define HAVE_OPENCL 0 +#define TD_VERIFY 0 +#define TD_POLAR 1 @@ -0,0 +1,220 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "config.h" + +#if HAVE_SCHED_GETAFFINITY +#define _GNU_SOURCE +#include <sched.h> +#endif +#if HAVE_GETPROCESSAFFINITYMASK +#include <windows.h> +#endif +#if HAVE_SYSCTL +#if HAVE_SYS_PARAM_H +#include <sys/param.h> +#endif +#include <sys/types.h> +#include <sys/sysctl.h> +#endif +#if HAVE_SYSCONF +#include <unistd.h> +#endif + +#include <string.h> + +#include "cpu.h" + +#if ARCH_X86 +static int get_cpu_flags_x86(void) +{ + int rval = 0; + + int eax, ebx, ecx, edx; + int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0; + int family = 0, model = 0; + union { int i[3]; char c[12]; } vendor; + + tdi_cpu_cpuid(0, &max_std_level, &vendor.i[0], &vendor.i[2], &vendor.i[1]); + + if (max_std_level >= 1) { + tdi_cpu_cpuid(1, &eax, &ebx, &ecx, &std_caps); + family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); + if (std_caps & (1 << 15)) + rval |= TDI_CPU_FLAG_CMOV; + if (std_caps & (1 << 23)) + rval |= TDI_CPU_FLAG_MMX; + if (std_caps & (1 << 25)) + rval 
|= TDI_CPU_FLAG_MMXEXT; +#if HAVE_SSE + if (std_caps & (1 << 25)) + rval |= TDI_CPU_FLAG_SSE; + if (std_caps & (1 << 26)) + rval |= TDI_CPU_FLAG_SSE2; + if (ecx & 1) + rval |= TDI_CPU_FLAG_SSE3; + if (ecx & 0x00000200 ) + rval |= TDI_CPU_FLAG_SSSE3; + if (ecx & 0x00080000 ) + rval |= TDI_CPU_FLAG_SSE4; + if (ecx & 0x00100000 ) + rval |= TDI_CPU_FLAG_SSE42; +#if HAVE_AVX + /* Check OXSAVE and AVX bits */ + if ((ecx & 0x18000000) == 0x18000000) { + /* Check for OS support */ + tdi_cpu_xgetbv(0, &eax, &edx); + if ((eax & 0x6) == 0x6) { + rval |= TDI_CPU_FLAG_AVX; + if (ecx & 0x00001000) + rval |= TDI_CPU_FLAG_FMA3; + } + } +#endif /* HAVE_AVX */ +#endif /* HAVE_SSE */ + } + if (max_std_level >= 7) { + tdi_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); +#if HAVE_AVX2 + if (ebx & 0x00000020) + rval |= TDI_CPU_FLAG_AVX2; +#endif /* HAVE_AVX2 */ + /* BMI1/2 don't need OS support */ + if (ebx & 0x00000008) { + rval |= TDI_CPU_FLAG_BMI1; + if (ebx & 0x00000100) + rval |= TDI_CPU_FLAG_BMI2; + } + } + + tdi_cpu_cpuid(0x80000000, &max_ext_level, &ebx, &ecx, &edx); + + if (max_ext_level >= 0x80000001) { + tdi_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &ext_caps); + if (ext_caps & (1U << 31)) + rval |= TDI_CPU_FLAG_3DNOW; + if (ext_caps & (1 << 30)) + rval |= TDI_CPU_FLAG_3DNOWEXT; + if (ext_caps & (1 << 23)) + rval |= TDI_CPU_FLAG_MMX; + if (ext_caps & (1 << 22)) + rval |= TDI_CPU_FLAG_MMXEXT; + + if (!strncmp(vendor.c, "AuthenticAMD", 12)) { + /* Allow for selectively disabling SSE2 functions on AMD processors + with SSE2 support but not SSE4a. This includes Athlon64, some + Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster + than SSE2 often enough to utilize this special-case flag. + TDI_CPU_FLAG_SSE2 and TDI_CPU_FLAG_SSE2SLOW are both set in this case + so that SSE2 is used unless explicitly disabled by checking + TDI_CPU_FLAG_SSE2SLOW. 
*/ + if (rval & TDI_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) + rval |= TDI_CPU_FLAG_SSE2SLOW; + + /* Similar to the above but for AVX functions on AMD processors. + This is necessary only for functions using YMM registers on Bulldozer + based CPUs as they lack 256-bit execution units. SSE/AVX functions + using XMM registers are always faster on them. + TDI_CPU_FLAG_AVX and TDI_CPU_FLAG_AVXSLOW are both set so that AVX is + used unless explicitly disabled by checking TDI_CPU_FLAG_AVXSLOW. + TODO: Confirm if Excavator is affected or not by this once it's + released, and update the check if necessary. Same for btver2. */ + if (family == 0x15 && (rval & TDI_CPU_FLAG_AVX)) + rval |= TDI_CPU_FLAG_AVXSLOW; + } + + /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be + * used unless the OS has AVX support. */ + if (rval & TDI_CPU_FLAG_AVX) { + if (ecx & 0x00000800) + rval |= TDI_CPU_FLAG_XOP; + if (ecx & 0x00010000) + rval |= TDI_CPU_FLAG_FMA4; + } + } + + if (!strncmp(vendor.c, "GenuineIntel", 12)) { + if (family == 6 && (model == 9 || model == 13 || model == 14)) { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and + * 6/14 (core1 "yonah") theoretically support sse2, but it's + * usually slower than mmx, so let's just pretend they don't. + * TDI_CPU_FLAG_SSE2 is disabled and TDI_CPU_FLAG_SSE2SLOW is + * enabled so that SSE2 is not used unless explicitly enabled + * by checking TDI_CPU_FLAG_SSE2SLOW. The same situation + * applies for TDI_CPU_FLAG_SSE3 and TDI_CPU_FLAG_SSE3SLOW. */ + if (rval & TDI_CPU_FLAG_SSE2) + rval ^= TDI_CPU_FLAG_SSE2SLOW | TDI_CPU_FLAG_SSE2; + if (rval & TDI_CPU_FLAG_SSE3) + rval ^= TDI_CPU_FLAG_SSE3SLOW | TDI_CPU_FLAG_SSE3; + } + /* The Atom processor has SSSE3 support, which is useful in many cases, + * but sometimes the SSSE3 version is slower than the SSE2 equivalent + * on the Atom, but is generally faster on other processors supporting + * SSSE3. 
This flag allows for selectively disabling certain SSSE3 + * functions on the Atom. */ + if (family == 6 && model == 28) + rval |= TDI_CPU_FLAG_ATOM; + + /* Conroe has a slow shuffle unit. Check the model number to ensure not + * to include crippled low-end Penryns and Nehalems that lack SSE4. */ + if ((rval & TDI_CPU_FLAG_SSSE3) && !(rval & TDI_CPU_FLAG_SSE4) && + family == 6 && model < 23) + rval |= TDI_CPU_FLAG_SSSE3SLOW; + } + + return rval; +} +#endif + +int tdi_init_cpu_flags(void) +{ + int flags = 0; + +#if ARCH_X86 + flags = get_cpu_flags_x86(); +#endif + + return flags; +} + +unsigned int tdi_cpu_count(void) +{ + unsigned int nb_cpus = 1; +#if HAVE_SCHED_GETAFFINITY && defined(CPU_COUNT) + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + + if (!sched_getaffinity(0, sizeof(cpuset), &cpuset)) + nb_cpus = CPU_COUNT(&cpuset); +#elif HAVE_GETPROCESSAFFINITYMASK + DWORD_PTR proc_aff, sys_aff; + if (GetProcessAffinityMask(GetCurrentProcess(), &proc_aff, &sys_aff)) + nb_cpus = av_popcount64(proc_aff); +#elif HAVE_SYSCTL && defined(HW_NCPU) + int mib[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(nb_cpus); + + if (sysctl(mib, 2, &nb_cpus, &len, NULL, 0) == -1) + nb_cpus = 0; +#elif HAVE_SYSCONF && defined(_SC_NPROC_ONLN) + nb_cpus = sysconf(_SC_NPROC_ONLN); +#elif HAVE_SYSCONF && defined(_SC_NPROCESSORS_ONLN) + nb_cpus = sysconf(_SC_NPROCESSORS_ONLN); +#endif + + return nb_cpus; +} @@ -0,0 +1,130 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef TEUKOLSKY_DATA_CPU_H +#define TEUKOLSKY_DATA_CPU_H + +#include "config.h" + +#define TDI_CPU_FLAG_MMX 0x0001 ///< standard MMX +#define TDI_CPU_FLAG_MMXEXT 0x0002 ///< SSE integer functions or AMD MMX ext +#define TDI_CPU_FLAG_3DNOW 0x0004 ///< AMD 3DNOW +#define TDI_CPU_FLAG_SSE 0x0008 ///< SSE functions +#define TDI_CPU_FLAG_SSE2 0x0010 ///< PIV SSE2 functions +#define TDI_CPU_FLAG_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) +#define TDI_CPU_FLAG_3DNOWEXT 0x0020 ///< AMD 3DNowExt +#define TDI_CPU_FLAG_SSE3 0x0040 ///< Prescott SSE3 functions +#define TDI_CPU_FLAG_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) +#define TDI_CPU_FLAG_SSSE3 0x0080 ///< Conroe SSSE3 functions +#define TDI_CPU_FLAG_SSSE3SLOW 0x4000000 ///< SSSE3 supported, but usually not faster +#define TDI_CPU_FLAG_ATOM 0x10000000 ///< Atom processor, some SSSE3 instructions are slower +#define TDI_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions +#define TDI_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions +#define TDI_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used +#define TDI_CPU_FLAG_AVXSLOW 0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. 
Bulldozer) +#define TDI_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions +#define TDI_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions +#define TDI_CPU_FLAG_CMOV 0x1000 ///< i686 cmov +#define TDI_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used +#define TDI_CPU_FLAG_FMA3 0x10000 ///< Haswell FMA3 functions +#define TDI_CPU_FLAG_BMI1 0x20000 ///< Bit Manipulation Instruction Set 1 +#define TDI_CPU_FLAG_BMI2 0x40000 ///< Bit Manipulation Instruction Set 2 + +#define TDI_CPU_FLAG_ALTIVEC 0x0001 ///< standard +#define TDI_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 +#define TDI_CPU_FLAG_POWER8 0x0004 ///< ISA 2.07 + +#define TDI_CPU_FLAG_ARMV5TE (1 << 0) +#define TDI_CPU_FLAG_ARMV6 (1 << 1) +#define TDI_CPU_FLAG_ARMV6T2 (1 << 2) +#define TDI_CPU_FLAG_VFP (1 << 3) +#define TDI_CPU_FLAG_VFPV3 (1 << 4) +#define TDI_CPU_FLAG_NEON (1 << 5) +#define TDI_CPU_FLAG_ARMV8 (1 << 6) +#define TDI_CPU_FLAG_VFP_VM (1 << 7) ///< VFPv2 vector mode, deprecated in ARMv7-A and unavailable in various CPUs implementations + +#define CPUEXT_SUFFIX(flags, suffix, cpuext) \ + (HAVE_ ## cpuext ## suffix && ((flags) & TDI_CPU_FLAG_ ## cpuext)) + +#define CPUEXT_SUFFIX_FAST(flags, suffix, cpuext) \ + (HAVE_ ## cpuext ## suffix && ((flags) & TDI_CPU_FLAG_ ## cpuext) && \ + !((flags) & TDI_CPU_FLAG_ ## cpuext ## SLOW)) + +#define CPUEXT_SUFFIX_SLOW(flags, suffix, cpuext) \ + (HAVE_ ## cpuext ## suffix && ((flags) & TDI_CPU_FLAG_ ## cpuext) && \ + ((flags) & TDI_CPU_FLAG_ ## cpuext ## SLOW)) + +#define CPUEXT(flags, cpuext) CPUEXT_SUFFIX(flags, , cpuext) +#define CPUEXT_FAST(flags, cpuext) CPUEXT_SUFFIX_FAST(flags, , cpuext) +#define CPUEXT_SLOW(flags, cpuext) CPUEXT_SUFFIX_SLOW(flags, , cpuext) + +#define X86_AMD3DNOW(flags) CPUEXT(flags, AMD3DNOW) +#define X86_AMD3DNOWEXT(flags) CPUEXT(flags, AMD3DNOWEXT) +#define X86_MMX(flags) CPUEXT(flags, MMX) +#define X86_MMXEXT(flags) CPUEXT(flags, MMXEXT) +#define X86_SSE(flags) CPUEXT(flags, SSE) +#define 
X86_SSE2(flags) CPUEXT(flags, SSE2) +#define X86_SSE2_FAST(flags) CPUEXT_FAST(flags, SSE2) +#define X86_SSE2_SLOW(flags) CPUEXT_SLOW(flags, SSE2) +#define X86_SSE3(flags) CPUEXT(flags, SSE3) +#define X86_SSE3_FAST(flags) CPUEXT_FAST(flags, SSE3) +#define X86_SSE3_SLOW(flags) CPUEXT_SLOW(flags, SSE3) +#define X86_SSSE3(flags) CPUEXT(flags, SSSE3) +#define X86_SSSE3_FAST(flags) CPUEXT_FAST(flags, SSSE3) +#define X86_SSSE3_SLOW(flags) CPUEXT_SLOW(flags, SSSE3) +#define X86_SSE4(flags) CPUEXT(flags, SSE4) +#define X86_SSE42(flags) CPUEXT(flags, SSE42) +#define X86_AVX(flags) CPUEXT(flags, AVX) +#define X86_AVX_FAST(flags) CPUEXT_FAST(flags, AVX) +#define X86_AVX_SLOW(flags) CPUEXT_SLOW(flags, AVX) +#define X86_XOP(flags) CPUEXT(flags, XOP) +#define X86_FMA3(flags) CPUEXT(flags, FMA3) +#define X86_FMA4(flags) CPUEXT(flags, FMA4) +#define X86_AVX2(flags) CPUEXT(flags, AVX2) + +#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW) +#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT) +#define EXTERNAL_MMX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMX) +#define EXTERNAL_MMXEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT) +#define EXTERNAL_SSE(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE) +#define EXTERNAL_SSE2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE2_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE2_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE2) +#define EXTERNAL_SSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSE3) +#define EXTERNAL_SSSE3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSSE3_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, SSSE3) +#define EXTERNAL_SSE4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4) +#define 
EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) +#define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) +#define EXTERNAL_AVX_FAST(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX) +#define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX) +#define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) +#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3) +#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) +#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) + +int tdi_init_cpu_flags(void); + +void tdi_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx); +void tdi_cpu_xgetbv(int op, int *eax, int *edx); + +unsigned int tdi_cpu_count(void); + +#endif /* TEUKOLSKY_DATA_CPU_H */ diff --git a/cpuid.asm b/cpuid.asm new file mode 100644 index 0000000..94c35c0 --- /dev/null +++ b/cpuid.asm @@ -0,0 +1,63 @@ +;***************************************************************************** +;* Copyright (C) 2005-2010 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Fiona Glaser <fiona@x264.com> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void tdi_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx) +; +; Runs CPUID for leaf `index` (sub-leaf 0, ecx is cleared first) and stores +; the four result registers through the output pointers. +; NOTE(review): assumes x86inc's cglobal maps the C arguments to r0..r4 and +; adds the project symbol prefix (cpu.h declares tdi_cpu_cpuid) -- confirm. +;----------------------------------------------------------------------------- +cglobal cpu_cpuid, 5,7 +; rbx is saved here and restored before RET; cpuid writes ebx + push rbx +; stash the output pointers on the stack, pushed in reverse so that they +; pop back in eax, ebx, ecx, edx order after cpuid has run + push r4 + push r3 + push r2 + push r1 + mov eax, r0d + xor ecx, ecx + cpuid + pop r4 + mov [r4], eax + pop r4 + mov [r4], ebx + pop r4 + mov [r4], ecx + pop r4 + mov [r4], edx + pop rbx + RET + +;----------------------------------------------------------------------------- +; void tdi_cpu_xgetbv(int op, int *eax, int *edx) +; +; Runs XGETBV with ecx = op and stores eax/edx through the output pointers. +;----------------------------------------------------------------------------- +cglobal cpu_xgetbv, 3,7 +; stash the output pointers, popped back in eax, edx order after xgetbv + push r2 + push r1 + mov ecx, r0d + xgetbv + pop r4 + mov [r4], eax + pop r4 + mov [r4], edx + RET @@ -0,0 +1,40 @@ +/* + * Copyright 2014-2017 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +/** + * @file + * logging code + */ + +#include <stdarg.h> +#include <stdio.h> + +#include "log.h" + +void tdi_log_default_callback(TDLogger *log, int level, + const char *fmt, va_list vl) +{ + vfprintf(stderr, fmt, vl); +} + +void tdi_log(TDLogger *log, int level, const char *fmt, ...) +{ + va_list vl; + va_start(vl, fmt); + log->log(log, level, fmt, vl); + va_end(vl); +} @@ -0,0 +1,32 @@ +/* + * Copyright 2017 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef TEUKOLSKY_DATA_LOG_H +#define TEUKOLSKY_DATA_LOG_H + +#include <stdarg.h> + +/** + * A message sink: a callback plus a caller-owned pointer. + */ +typedef struct TDLogger { + /* caller-owned pointer; presumably meant for the callback's private state -- TODO confirm */ + void *opaque; + /* callback receiving the logger itself, the message level, a printf-style + * format string and its arguments */ + void (*log)(struct TDLogger *log, int level, const char *fmt, va_list vl); +} TDLogger; + +/** + * Send a printf-style message to the logger's callback. + */ +void tdi_log(TDLogger *log, int level, const char *fmt, ...); +/** + * Logging callback that prints the formatted message to stderr. + */ +void tdi_log_default_callback(TDLogger *log, int level, + const char *fmt, va_list vl); + +#endif // TEUKOLSKY_DATA_LOG_H diff --git a/pssolve.c b/pssolve.c new file mode 100644 index 0000000..f2288c2 --- /dev/null +++ b/pssolve.c @@ -0,0 +1,521 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <cblas.h> +#include <lapacke.h> + +#include "bicgstab.h" +#include "common.h" +#include "log.h" +#include "pssolve.h" +#include "threadpool.h" + +#define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1]) +#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1]) + +typedef struct PSEquationContext { + size_t nb_coeffs[2]; + size_t nb_colloc_points[2]; + unsigned int colloc_grid_order[2]; + + double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB]; + double *mat; +} PSEquationContext; + +struct PSSolvePriv { + BiCGStabContext *bicgstab; + int steps_since_inverse; + + size_t nb_coeffs; + + PSEquationContext *eqs; + + int *ipiv; + double *mat; + + ThreadPoolContext *tp; + ThreadPoolContext *tp_internal; +}; + +typedef struct ConstructMatrixThread { + const PSEquationContext *eq_ctx; + const double **eq_coeffs; + double *mat; + ptrdiff_t mat_stride; + unsigned int var_idx; +} ConstructMatrixThread; + +static void construct_matrix(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + ConstructMatrixThread *cmt = arg; + const PSEquationContext *eq_ctx = cmt->eq_ctx; + const double **eq_coeffs = cmt->eq_coeffs; + double *mat = cmt->mat; + ptrdiff_t mat_stride = cmt->mat_stride; + unsigned int var_idx = cmt->var_idx; + unsigned int idx_coeff = job_idx; + + for (int idx_grid = 0; idx_grid < NB_COLLOC_POINTS(eq_ctx); idx_grid++) { + const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + double val = 0.0; + + for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++) + val += eq_coeffs[i][idx_grid] * eq_ctx->basis_val[var_idx][i][idx]; + + mat[idx_grid + mat_stride * idx_coeff] = val; + } +} + +static int lu_invert(TDLogger *logger, const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + 
double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; + +#if 0 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + tdi_log(logger, 1, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +int tdi_pssolve_solve(PSSolveContext *ctx, + const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs) +{ + PSSolvePriv *s = ctx->priv; + double rhs_max; + int64_t start; + + int ret = 0; + + /* fill the matrix */ + start = gettime(); + + for (int i = 0; i < 
ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + double *mat = s->eqs[i].mat; + + for (int j = 0; j < ctx->nb_equations; j++) { + ConstructMatrixThread thread = { + .eq_ctx = eq_ctx, + .eq_coeffs = eq_coeffs[i][j], + .mat = mat, + .mat_stride = s->nb_coeffs, + .var_idx = j, + }; + tdi_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix, + &thread); + mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs; + } + } + + ctx->construct_matrix_time += gettime() - start; + ctx->construct_matrix_count++; + +#if 0 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (s->steps_since_inverse < 1024) { + int64_t start; + + start = gettime(); + + ret = tdi_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); + + if (ret >= 0) { + ctx->cg_time_total += gettime() - start; + ctx->cg_solve_count++; + ctx->cg_iter_count += ret + 1; + s->steps_since_inverse++; + + } + } else + ret = -1; + + if (ret < 0) { + int64_t start; + + start = gettime(); + + memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs)); + + ret = lu_invert(&ctx->logger, s->nb_coeffs, s->mat, coeffs, s->ipiv); + ctx->lu_solves_time += gettime() - start; + ctx->lu_solves_count++; + + ret = tdi_bicgstab_init(s->bicgstab, s->mat, coeffs); + + s->steps_since_inverse = 0; + } + + return ret; +} + +static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx) +{ + PSSolvePriv *s = ctx->priv; + PSEquationContext *eq_ctx = &s->eqs[eq_idx]; + int ret; + + eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val)); + if (!eq_ctx->basis_val) + return -ENOMEM; + + for (int i = 0; i < ctx->nb_equations; i++) { + double *basis_val[2][3] = { { NULL } }; + + /* for each direction, compute the corresponding basis values/derivatives */ + for 
(int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) { + for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) { + ret = posix_memalign((void**)&basis_val[dir][diff_order], 32, + sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) { + double coord = ctx->colloc_grid[eq_idx][dir][k]; + for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) { + basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_VALUE, coord, l); + basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF1, coord, l); + basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF2, coord, l); + } + } + } + + for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) { + ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32, + NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(eq_ctx) * sizeof(*eq_ctx->basis_val[i][diff])); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) { + const double *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1]; + const double *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1]; + const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1]; + + for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) { + const double *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0]; + const double *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0]; + const double *d2basis0 = basis_val[0][2] + k * s->eqs[i].nb_coeffs[0]; + + const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k; + + for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++) + for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) { + const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m; + const int idx = idx_grid + 
NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] = basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] = dbasis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] = basis0[m] * dbasis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] = basis0[m] * d2basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] = dbasis0[m] * dbasis1[l]; + } + } + } + +fail: + for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) + for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++) + free(basis_val[dir][diff]); + if (ret < 0) + return ret; + } + + return 0; +} + +int tdi_pssolve_context_init(PSSolveContext *ctx) +{ + PSSolvePriv *s = ctx->priv; + size_t N = 0; + + int ret = 0; + + if (ctx->tp) { + s->tp = ctx->tp; + } else { + ret = tdi_threadpool_init(&s->tp_internal, 1); + if (ret < 0) + return ret; + s->tp = s->tp_internal; + } + + /* sanity check the parameters */ + for (int i = 0; i < ctx->nb_equations; i++) { + if (!ctx->basis[i][0] || !ctx->basis[i][1]) { + tdi_log(&ctx->logger, 0, "Basis set for variable %d not set\n", i); + return -EINVAL; + } + if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) { + tdi_log(&ctx->logger, 0, "Solver order for variable %d not set\n", i); + return -EINVAL; + } + + N += ctx->solve_order[i][0] * ctx->solve_order[i][1]; + } + + ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N); + ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N); + if (ret) + return -ENOMEM; + + s->nb_coeffs = N; + + ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid)); + if (!ctx->colloc_grid) + return -ENOMEM; + + /* initialize the per-equation state */ + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + + eq_ctx->nb_coeffs[0] = ctx->solve_order[i][0]; + eq_ctx->nb_coeffs[1] = ctx->solve_order[i][1]; + 
eq_ctx->nb_colloc_points[0] = ctx->solve_order[i][0]; + eq_ctx->nb_colloc_points[1] = ctx->solve_order[i][1]; + eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0]; + eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1]; + + if (i == 0) + eq_ctx->mat = s->mat; + else + eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]); + + /* compute the collocation grid */ + posix_memalign((void**)&ctx->colloc_grid[i][0], 32, eq_ctx->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[i][0])); + posix_memalign((void**)&ctx->colloc_grid[i][1], 32, eq_ctx->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[i][1])); + if (!ctx->colloc_grid[i][0] || !ctx->colloc_grid[i][1]) + return -ENOMEM; + + for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++) + ctx->colloc_grid[i][0][j] = tdi_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j); + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) + ctx->colloc_grid[i][1][j] = tdi_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j); + + } + + /* precompute the basis values we will need */ + for (int i = 0; i < ctx->nb_equations; i++) { + ret = basis_val_init(ctx, i); + if (ret < 0) + return ret; + } + + s->steps_since_inverse = INT_MAX; + + /* init the BiCGStab solver */ + ret = tdi_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue); + if (ret < 0) + return ret; + + return 0; +} + +int tdi_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations) +{ + PSSolveContext *ctx; + + if (!nb_equations) + return -EINVAL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->nb_equations = nb_equations; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + + ctx->priv->eqs = calloc(nb_equations, sizeof(*ctx->priv->eqs)); + if (!ctx->priv->eqs) + goto fail; + + ctx->basis = calloc(nb_equations, sizeof(*ctx->basis)); + if (!ctx->basis) + goto fail; + + ctx->solve_order = calloc(nb_equations, sizeof(*ctx->solve_order)); + if 
(!ctx->solve_order) + goto fail; + + *pctx = ctx; + return 0; +fail: + tdi_pssolve_context_free(&ctx); + return -ENOMEM; +} + +void tdi_pssolve_context_free(PSSolveContext **pctx) +{ + PSSolveContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + if (ctx->priv->eqs) { + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &ctx->priv->eqs[i]; + + if (eq_ctx->basis_val) { + for (int j = 0; j < ctx->nb_equations; j++) + for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++) + free(eq_ctx->basis_val[j][k]); + } + free(eq_ctx->basis_val); + } + } + + free(ctx->priv->eqs); + + free(ctx->priv->ipiv); + free(ctx->priv->mat); + + tdi_bicgstab_context_free(&ctx->priv->bicgstab); + tdi_threadpool_free(&ctx->priv->tp_internal); + } + + free(ctx->priv); + + if (ctx->colloc_grid) { + for (int i = 0; i < ctx->nb_equations; i++) + for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++) + free(ctx->colloc_grid[i][j]); + } + + free(ctx->colloc_grid); + + free(ctx->basis); + free(ctx->solve_order); + + free(ctx); + *pctx = NULL; +} + +int tdi_pssolve_diff_order(enum PSSolveDiffOrder order, unsigned int dir) +{ + if (dir == 0) { + switch (order) { + case PSSOLVE_DIFF_ORDER_00: + case PSSOLVE_DIFF_ORDER_01: + case PSSOLVE_DIFF_ORDER_02: return 0; + case PSSOLVE_DIFF_ORDER_10: + case PSSOLVE_DIFF_ORDER_11: return 1; + case PSSOLVE_DIFF_ORDER_20: return 2; + } + } else if (dir == 1) { + switch (order) { + case PSSOLVE_DIFF_ORDER_00: + case PSSOLVE_DIFF_ORDER_10: + case PSSOLVE_DIFF_ORDER_20: return 0; + case PSSOLVE_DIFF_ORDER_01: + case PSSOLVE_DIFF_ORDER_11: return 1; + case PSSOLVE_DIFF_ORDER_02: return 2; + } + } + return -1; +} diff --git a/pssolve.h b/pssolve.h new file mode 100644 index 0000000..6d5d1c0 --- /dev/null +++ b/pssolve.h @@ -0,0 +1,188 @@ +/* + * Pseudospectral solver for 2nd order 2D linear PDE systems + * Copyright (C) 2014-2017 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute 
#ifndef TEUKOLSKY_PSSOLVE_H
#define TEUKOLSKY_PSSOLVE_H

/**
 * The problem being solved is a sequence of N linear partial differential
 * equations
 *
 *      /                                                                   \
 *   ∑  |  ∑ C_{ab}^{ik} ∂_a ∂_b u_k + ∑ C_{a}^{ik} ∂_a u_k + C^{ik} u_k  | = S^i
 *   k  \ a,b                           a                                   /
 *
 * where
 *      * i numbers the equations and runs from 0 to N-1
 *      * k numbers the unknown functions and also runs from 0 to N-1
 *      * a and b identify spatial directions and run from 0 to 1
 *      * u_k = u_k(x_a) is the k-th unknown function
 *      * C_{ab}^{ik}, C_{a}^{ik} and C^{ik} are the coefficients in front of
 *        the corresponding derivative of k-th unknown function in the i-th
 *        equation
 *      * S^i is the right-hand side of the i-th equation
 * C_{*}^{ik} and S^i are all (known) functions of space and define the
 * equations to be solved.
 */

#include "config.h"

/* Without OpenCL support the OpenCL handle types are replaced by opaque
 * pointers, so that the context layout does not depend on the header. */
#if HAVE_OPENCL
#include <cl.h>
#else
typedef void* cl_context;
typedef void* cl_command_queue;
#endif

#include <stdint.h>

#include "basis.h"
#include "log.h"
#include "threadpool.h"

/**
 * Identifies a derivative of an unknown function. In PSSOLVE_DIFF_ORDER_XY,
 * X is the order of differentiation along direction 0 and Y the order along
 * direction 1 (see tdi_pssolve_diff_order()).
 */
enum PSSolveDiffOrder {
    PSSOLVE_DIFF_ORDER_00,
    PSSOLVE_DIFF_ORDER_10,
    PSSOLVE_DIFF_ORDER_01,
    PSSOLVE_DIFF_ORDER_11,
    PSSOLVE_DIFF_ORDER_20,
    PSSOLVE_DIFF_ORDER_02,
    /* number of derivative identifiers, not a valid identifier itself */
    PSSOLVE_DIFF_ORDER_NB,
};

typedef struct PSSolvePriv PSSolvePriv;

typedef struct PSSolveContext {
    /**
     * Solver private data, not to be touched by the caller.
     */
    PSSolvePriv *priv;

    /**
     * The logging context.
     * Set by the caller before tdi_pssolve_context_init().
     */
    TDLogger logger;

    /**
     * Number of equations/unknown functions in the set.
     * Set by tdi_pssolve_context_alloc().
     */
    unsigned int nb_equations;

    /**
     * The basis sets.
     *
     * basis[i][j] is the basis set used for i-th variable in j-th direction.
     *
     * The array is allocated by tdi_pssolve_context_alloc(), must be filled
     * by the caller before tdi_pssolve_context_init().
     */
    const BasisSetContext *(*basis)[2];

    /**
     * Order of the solver.
     *
     * solve_order[i][j] is the order of the solver (i.e. the number of the
     * basis functions used) for i-th variable in j-th direction.
     *
     * Allocated by tdi_pssolve_context_alloc(), must be filled by the caller
     * before tdi_pssolve_context_init().
     */
    unsigned int (*solve_order)[2];

    /**
     * Locations of the collocation points. The equation coefficients passed to
     * tdi_pssolve_solve() should be evaluated at those grid positions.
     *
     * colloc_grid[i][j] is an array of length solve_order[i][j] and contains
     * the collocation points for the i-th equation in the j-th direction.
     *
     * Set by the solver after tdi_pssolve_context_init().
     */
    double *(*colloc_grid)[2];

    /**
     * The thread pool used for multithreaded execution. May be set by the
     * caller before tdi_pssolve_context_init(), otherwise a single thread will
     * be used.
     */
    ThreadPoolContext *tp;

    /**
     * OpenCL context and command queue, handed to the BiCGSTAB solver by
     * tdi_pssolve_context_init().
     * NOTE(review): presumably optional and set by the caller before init -
     * confirm against the BiCGSTAB code.
     */
    cl_context       ocl_ctx;
    cl_command_queue ocl_queue;

    /*
     * Statistics counters.
     * NOTE(review): the code updating them is not part of this header;
     * presumably they are maintained by tdi_pssolve_solve(). The unit of the
     * *_time fields needs to be confirmed there.
     */
    uint64_t lu_solves_count;
    uint64_t lu_solves_time;

    uint64_t cg_solve_count;
    uint64_t cg_iter_count;
    uint64_t cg_time_total;

    uint64_t construct_matrix_count;
    uint64_t construct_matrix_time;
} PSSolveContext;

/**
 * Allocate a new solver.
 *
 * @param ctx The newly allocated solver context will be written here.
 * @param nb_equations number of equations to solve (equal to the number of
 *                     unknown functions to solve for)
 *
 * @return 0 on success, a negative error code on failure.
 */
int tdi_pssolve_context_alloc(PSSolveContext **ctx, unsigned int nb_equations);

/**
 * Initialize the solver for use after all the context options have been set.
 * This function must be called exactly once before any calls to
 * tdi_pssolve_solve().
 *
 * @return 0 on success, a negative error code on failure.
 */
int tdi_pssolve_context_init(PSSolveContext *ctx);

/**
 * Free the solver and write NULL to the supplied pointer.
 */
void tdi_pssolve_context_free(PSSolveContext **ctx);

/**
 * Solve a PDE. This function may be called multiple times in succession to
 * solve multiple related PDEs (it will be efficient if the equation
 * coefficients do not change too much).
 *
 * @param ctx       the solver context
 * @param eq_coeffs the equation coefficients at the collocation points.
 *                  eq_coeffs[i][j][k] is the array of coefficients for the k-th
 *                  derivative (as per enum PSSolveDiffOrder) of the j-th
 *                  unknown function in the i-th equation.
 * @param rhs       the right-hand side of the equation at the collocation points.
 * @param coeffs    the spectral coefficients of the solution will be written here.
 *
 * @return 0 on success, a negative error code on failure. The contents of
 *         coeffs are undefined on failure.
 */
int tdi_pssolve_solve(PSSolveContext *ctx,
                      const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
                      const double *rhs, double *coeffs);

/**
 * Get the order of differentiation along one direction for a derivative
 * identifier.
 *
 * @param order the derivative identifier
 * @param dir   the spatial direction, 0 or 1
 *
 * @return the derivative order (0, 1 or 2) of order along dir, or -1 when
 *         either argument is invalid.
 */
int tdi_pssolve_diff_order(enum PSSolveDiffOrder order, unsigned int dir);

#endif /* TEUKOLSKY_PSSOLVE_H */
+ */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "basis.h" +#include "common.h" +#include "log.h" +#include "pssolve.h" + +#define N_X 8 +#define N_Z 4 +#define FUNC_X 4 +#define FUNC_Z 2 +#define TOL 5e-15 +static int scalar0(void) +{ + BasisSetContext *basis = NULL; + PSSolveContext *ctx = NULL; + + double eq_coeffs00[PSSOLVE_DIFF_ORDER_NB][N_X * N_Z]; + const double *eq_coeffs0[PSSOLVE_DIFF_ORDER_NB]; + const double *(*eq_coeffs)[PSSOLVE_DIFF_ORDER_NB]; + double rhs[N_X * N_Z]; + double coeffs[N_X * N_Z]; + int ret; + + ret = tdi_basis_init(&basis, BASIS_FAMILY_SB_EVEN, 1.0); + if (ret < 0) + goto finish; + + ret = tdi_pssolve_context_alloc(&ctx, 1); + if (ret < 0) + return ret; + + ctx->basis[0][0] = basis; + ctx->basis[0][1] = basis; + + ctx->solve_order[0][0] = N_X; + ctx->solve_order[0][1] = N_Z; + + ctx->logger.log = tdi_log_default_callback; + + ret = tdi_pssolve_context_init(ctx); + if (ret < 0) + goto finish; + + for (int j = 0; j < N_Z; j++) { + const double z = ctx->colloc_grid[0][1][j]; + + for (int i = 0; i < N_X; i++) { + const double x = ctx->colloc_grid[0][0][i]; + + const int idx = j * N_X + i; + + eq_coeffs00[PSSOLVE_DIFF_ORDER_20][idx] = 1.0; + eq_coeffs00[PSSOLVE_DIFF_ORDER_02][idx] = 1.0; + eq_coeffs00[PSSOLVE_DIFF_ORDER_10][idx] = 0.0; + eq_coeffs00[PSSOLVE_DIFF_ORDER_01][idx] = 0.0; + eq_coeffs00[PSSOLVE_DIFF_ORDER_11][idx] = 0.0; + eq_coeffs00[PSSOLVE_DIFF_ORDER_00][idx] = 0.0; + + rhs[idx] = tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, x, FUNC_X) * + tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, z, FUNC_Z) + + tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, x, FUNC_X) * + tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, z, FUNC_Z); + } + } + for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++) + eq_coeffs0[i] = eq_coeffs00[i]; + eq_coeffs = &eq_coeffs0; + + ret = tdi_pssolve_solve(ctx, &eq_coeffs, rhs, coeffs); + if (ret < 0) + goto finish; + + for (int j = 0; j < N_Z; j++) + for (int i = 0; i < N_X; i++) { + const int idx = j * N_X 
+ i; + const double val = (i == FUNC_X && j == FUNC_Z) ? 1.0 : 0.0; + + if (fabs(coeffs[idx] - val) > TOL) { + fprintf(stderr, "unexpected value %g at %d/%d\n", + coeffs[idx], i, j); + ret = -1; + goto finish; + } + } + +finish: + tdi_pssolve_context_free(&ctx); + tdi_basis_free(&basis); + + return ret; +} +#undef N +#undef FUNC_X +#undef FUNC_Z +#undef TOL + +static const struct { + const char *name; + int (*test)(void); +} tests[] = { + { "scalar0", scalar0 }, +}; + +int main(void) +{ + int ret = 0; + + for (int i = 0; i < ARRAY_ELEMS(tests); i++) { + fprintf(stderr, "executing test '%s'\n", tests[i].name); + ret = tests[i].test(); + if (ret < 0) { + fprintf(stderr, "test '%s' failed\n", tests[i].name); + return -1; + } + fprintf(stderr, "test '%s' succeeded\n", tests[i].name); + } + + return 0; +} diff --git a/threadpool.c b/threadpool.c new file mode 100644 index 0000000..ab5cf1d --- /dev/null +++ b/threadpool.c @@ -0,0 +1,178 @@ +/* + * Copyright 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <errno.h> +#include <pthread.h> +#include <stdlib.h> + +#include "cpu.h" +#include "threadpool.h" + +typedef struct WorkerContext { + ThreadPoolContext *parent; + pthread_t thread; + unsigned int idx; +} WorkerContext; + +struct ThreadPoolContext { + WorkerContext *workers; + unsigned int nb_workers; + + pthread_mutex_t mutex; + pthread_cond_t cond; + void (*func)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads); + void *func_arg; + int next_job; + int nb_jobs; + int nb_jobs_finished; + + int finish; +}; + +void *worker_thread(void *arg) +{ + WorkerContext *w = arg; + ThreadPoolContext *ctx = w->parent; + int nb_jobs, job_idx; + + while (1) { + pthread_mutex_lock(&ctx->mutex); + while (!ctx->finish && ctx->next_job >= ctx->nb_jobs) + pthread_cond_wait(&ctx->cond, &ctx->mutex); + + if (ctx->finish) { + pthread_mutex_unlock(&ctx->mutex); + break; + } + + nb_jobs = ctx->nb_jobs; + job_idx = ctx->next_job++; + + pthread_mutex_unlock(&ctx->mutex); + + ctx->func(ctx->func_arg, job_idx, nb_jobs, w->idx, ctx->nb_workers); + + pthread_mutex_lock(&ctx->mutex); + + ctx->nb_jobs_finished++; + + pthread_cond_broadcast(&ctx->cond); + pthread_mutex_unlock(&ctx->mutex); + } + return NULL; +} + +int tdi_threadpool_init(ThreadPoolContext **pctx, unsigned int nb_threads) +{ + ThreadPoolContext *ctx; + int ret = 0; + + if (!nb_threads) { + nb_threads = tdi_cpu_count(); + if (!nb_threads) + return -ENOSYS; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + pthread_mutex_init(&ctx->mutex, NULL); + pthread_cond_init(&ctx->cond, NULL); + + ctx->workers = calloc(nb_threads, sizeof(*ctx->workers)); + if (!ctx->workers) { + ret = -ENOMEM; + goto fail; + } + + for (int i = 0; i < nb_threads; i++) { + WorkerContext *w = &ctx->workers[i]; + + w->idx = i; + w->parent = ctx; + + ret = pthread_create(&w->thread, NULL, worker_thread, w); + if (ret) { + ret = -ret; + goto fail; + } + + 
ctx->nb_workers++; + } + + + *pctx = ctx; + return 0; +fail: + tdi_threadpool_free(&ctx); + return ret; +} + +void tdi_threadpool_free(ThreadPoolContext **pctx) +{ + ThreadPoolContext *ctx = *pctx; + + if (!ctx) + return; + + pthread_mutex_lock(&ctx->mutex); + ctx->finish = 1; + pthread_cond_broadcast(&ctx->cond); + pthread_mutex_unlock(&ctx->mutex); + + + for (int i = 0; i < ctx->nb_workers; i++) { + WorkerContext *w = &ctx->workers[i]; + pthread_join(w->thread, NULL); + } + + pthread_mutex_destroy(&ctx->mutex); + pthread_cond_destroy(&ctx->cond); + + free(ctx->workers); + + free(ctx); + *pctx = NULL; +} + +void tdi_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs, + void (*func)(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads), + void *arg) +{ + pthread_mutex_lock(&ctx->mutex); + + ctx->func = func; + ctx->func_arg = arg; + + ctx->nb_jobs = nb_jobs; + ctx->nb_jobs_finished = 0; + ctx->next_job = 0; + + pthread_cond_broadcast(&ctx->cond); + while (ctx->nb_jobs_finished < ctx->nb_jobs) + pthread_cond_wait(&ctx->cond, &ctx->mutex); + + ctx->func = NULL; + ctx->func_arg = NULL; + + pthread_mutex_unlock(&ctx->mutex); +} diff --git a/threadpool.h b/threadpool.h new file mode 100644 index 0000000..b22caab --- /dev/null +++ b/threadpool.h @@ -0,0 +1,32 @@ +/* + * Copyright 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
#ifndef TEUKOLSKY_DATA_THREADPOOL_H
#define TEUKOLSKY_DATA_THREADPOOL_H

/* Opaque thread pool handle. */
typedef struct ThreadPoolContext ThreadPoolContext;

/**
 * Create a thread pool.
 *
 * @param ctx        the newly created pool is written here on success
 * @param nb_threads number of worker threads to start; 0 selects the number
 *                   of CPUs as reported by tdi_cpu_count()
 *
 * @return 0 on success, a negative error code on failure
 */
int  tdi_threadpool_init(ThreadPoolContext **ctx, unsigned int nb_threads);

/**
 * Stop the worker threads, free the pool and write NULL to the supplied
 * pointer. Does nothing when *ctx is NULL.
 */
void tdi_threadpool_free(ThreadPoolContext **ctx);

/**
 * Execute func once for each job index from 0 to nb_jobs - 1 on the pool's
 * worker threads; returns after all the jobs have finished.
 *
 * func receives arg unchanged, the index of the job and the total number of
 * jobs, and the index of the executing worker together with the number of
 * workers in the pool.
 */
void tdi_threadpool_execute(ThreadPoolContext *ctx, unsigned int nb_jobs,
                            void (*func)(void *arg,
                                         unsigned int job_idx, unsigned int nb_jobs,
                                         unsigned int thread_idx, unsigned int nb_threads),
                            void *arg);

#endif /* TEUKOLSKY_DATA_THREADPOOL_H */
+;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix x264 +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%if HAVE_ALIGNED_STACK + %define STACK_ALIGNMENT 16 +%endif +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; aout does not support align= +; NOTE: This section is out of sync with x264, in order to +; keep supporting OS/2. 
+%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %if HAVE_CPUNOP + CPU %1 + %endif +%endmacro + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. 
+ +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + 
+%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if 
stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. 
+ %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
+ %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 1 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. 
+ %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is 
executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. +%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + CPUNOP amdnop + %else + CPUNOP basicnop + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) 
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. 
+; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1 %+ SUFFIX, %1 +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) +;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 
+AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse +AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4, 0, 1, 0 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 
1 +AVX_INSTR palignr, ssse3, 0, 1, 0 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR 
pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2 +AVX_INSTR sqrtps, sse +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 
+AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 
+ %ifnum sizeof%3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/x86util.asm b/x86util.asm new file mode 100644 index 0000000..5f5f87a --- /dev/null +++ b/x86util.asm @@ -0,0 +1,695 @@ +;***************************************************************************** +;* x86util.asm +;***************************************************************************** +;* Copyright (C) 2008-2010 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Holger Lubitz <holger@lubitz.org> +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%define private_prefix tdi +%define public_prefix tdi +%define cpuflags_mmxext cpuflags_mmx2 + +%include "config.asm" + +%include "x86inc.asm" + +%macro SBUTTERFLY 4 +%if avx_enabled == 0 + mova m%4, m%2 + punpckl%1 m%2, m%3 + punpckh%1 m%4, m%3 +%else + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 +%endif + SWAP %3, %4 +%endmacro + +%macro SBUTTERFLY2 4 + punpckl%1 m%4, m%2, m%3 + punpckh%1 m%2, m%2, m%3 + SWAP %2, %4, %3 +%endmacro + +%macro SBUTTERFLYPS 3 + unpcklps m%3, m%1, m%2 + unpckhps m%1, m%1, m%2 + SWAP %1, %3, %2 +%endmacro + +%macro TRANSPOSE4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE2x4x4W 5 + SBUTTERFLY wd, %1, %2, %5 + SBUTTERFLY wd, %3, %4, %5 + SBUTTERFLY dq, %1, %3, %5 + SBUTTERFLY dq, %2, %4, %5 + SBUTTERFLY qdq, %1, %2, %5 + SBUTTERFLY qdq, %3, %4, %5 +%endmacro + +%macro TRANSPOSE4x4D 5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 + SBUTTERFLY qdq, %1, %3, %5 + SBUTTERFLY qdq, %2, %4, %5 + SWAP %2, %3 +%endmacro + +; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops +%macro TRANSPOSE4x4PS 5 + SBUTTERFLYPS %1, %2, %5 + SBUTTERFLYPS %3, %4, %5 + movlhps m%5, m%1, m%3 + movhlps m%3, m%1 + SWAP %5, %1 + movlhps m%5, m%2, m%4 + movhlps m%4, m%2 + SWAP %5, %2, %3 +%endmacro + +%macro TRANSPOSE8x8W 9-11 +%if ARCH_X86_64 + SBUTTERFLY wd, %1, %2, %9 + SBUTTERFLY wd, %3, %4, %9 + SBUTTERFLY wd, %5, %6, %9 + SBUTTERFLY wd, %7, %8, %9 + SBUTTERFLY dq, %1, 
%3, %9
+    SBUTTERFLY dq, %2, %4, %9
+    SBUTTERFLY dq, %5, %7, %9
+    SBUTTERFLY dq, %6, %8, %9
+    SBUTTERFLY qdq, %1, %5, %9
+    SBUTTERFLY qdq, %2, %6, %9
+    SBUTTERFLY qdq, %3, %7, %9
+    SBUTTERFLY qdq, %4, %8, %9
+    SWAP %2, %5
+    SWAP %4, %7
+%else
+; in: m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+    movdqa %9, m%7
+%endif
+    SBUTTERFLY wd, %1, %2, %7
+    movdqa %10, m%2
+    movdqa m%7, %9
+    SBUTTERFLY wd, %3, %4, %2
+    SBUTTERFLY wd, %5, %6, %2
+    SBUTTERFLY wd, %7, %8, %2
+    SBUTTERFLY dq, %1, %3, %2
+    movdqa %9, m%3
+    movdqa m%2, %10
+    SBUTTERFLY dq, %2, %4, %3
+    SBUTTERFLY dq, %5, %7, %3
+    SBUTTERFLY dq, %6, %8, %3
+    SBUTTERFLY qdq, %1, %5, %3
+    SBUTTERFLY qdq, %2, %6, %3
+    movdqa %10, m%2
+    movdqa m%3, %9
+    SBUTTERFLY qdq, %3, %7, %2
+    SBUTTERFLY qdq, %4, %8, %2
+    SWAP %2, %5
+    SWAP %4, %7
+%if %0<11
+    movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
+; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
+%macro PABSW 2 ; dst, src: dst = abs of each word of src; the plain-mmx path also modifies src
+%if cpuflag(ssse3)
+    pabsw %1, %2
+%elif cpuflag(mmxext)
+    pxor %1, %1 ; dst = 0
+    psubw %1, %2 ; dst = -src
+    pmaxsw %1, %2 ; dst = max(-src, src)
+%else
+    pxor %1, %1
+    pcmpgtw %1, %2 ; dst = all-ones in words where src < 0
+    pxor %2, %1
+    psubw %2, %1 ; src = (src ^ mask) - mask
+    SWAP %1, %2 ; result was built in src, so exchange the register names
+%endif
+%endmacro
+
+%macro PSIGNW_MMX 2 ; dst, mask: dst = (dst ^ mask) - mask, i.e. negated where a mask word is all-ones
+    pxor %1, %2
+    psubw %1, %2
+%endmacro
+
+%macro PSIGNW_SSSE3 2 ; dst, src: plain psignw
+    psignw %1, %2
+%endmacro
+
+%macro ABS1 2 ; a, tmp: in-place abs of words (tmp unused for ssse3)
+%if cpuflag(ssse3)
+    pabsw %1, %1
+%elif cpuflag(mmxext) ; a, tmp
+    pxor %2, %2
+    psubw %2, %1 ; tmp = -a
+    pmaxsw %1, %2 ; a = max(a, -a)
+%else ; a, tmp
+    pxor %2, %2
+    pcmpgtw %2, %1 ; tmp = all-ones in words where a < 0
+    pxor %1, %2
+    psubw %1, %2 ; a = (a ^ tmp) - tmp
+%endif
+%endmacro
+
+%macro ABS2 4 ; a, b, tmp0, tmp1: ABS1 applied to two registers with interleaved instructions
+%if cpuflag(ssse3)
+    pabsw %1, %1
+    pabsw %2, %2
+%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
+    pxor %3, %3
+    pxor %4, %4
+    psubw %3, %1
+    psubw %4, %2
+    pmaxsw %1, %3
+    pmaxsw %2, %4
+%else ; a, b, tmp0, tmp1
+    pxor %3, %3
+    pxor %4, %4
+    pcmpgtw %3, %1
+    pcmpgtw %4, %2
+    pxor %1, %3
+    pxor %2, %4
+    psubw %1, %3
+    psubw %2, %4
+%endif
+%endmacro
+
+%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
+%if cpuflag(ssse3)
+    pabsb %1, %1
+%else
+    pxor %2, %2
+    psubb %2, %1 ; tmp = -src
+    pminub %1, %2 ; unsigned min of src and -src
+%endif
+%endmacro
+
+%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
+%if cpuflag(ssse3)
+    pabsb %1, %1
+    pabsb %2, %2
+%else
+    pxor %3, %3
+    pxor %4, %4
+    psubb %3, %1
+    psubb %4, %2
+    pminub %1, %3
+    pminub %2, %4
+%endif
+%endmacro
+
+%macro ABSD2_MMX 4 ; a, b, tmp0, tmp1: in-place abs of the dwords of a and b
+    pxor %3, %3
+    pxor %4, %4
+    pcmpgtd %3, %1 ; tmp0 = all-ones in dwords where a < 0
+    pcmpgtd %4, %2
+    pxor %1, %3
+    pxor %2, %4
+    psubd %1, %3 ; a = (a ^ tmp0) - tmp0
+    psubd %2, %4
+%endmacro
+
+%macro ABS4 6 ; a, b, c, d, tmp0, tmp1: ABS2 on (a, b), then on (c, d), reusing the temps
+    ABS2 %1, %2, %5, %6
+    ABS2 %3, %4, %5, %6
+%endmacro
+
+%macro SPLATB_LOAD 3 ; dst, address, pshufb mask (mask is used only with ssse3)
+%if cpuflag(ssse3)
+    movd %1, [%2-3]
+    pshufb %1, %3
+%else
+    movd %1, [%2-3] ;to avoid crossing a cacheline
+    punpcklbw %1, %1 ; byte at [%2] is the top byte of the dword, so it now fills word 3
+    SPLATW %1, %1, 3
+%endif
+%endmacro
+
+%macro SPLATB_REG 3 ; dst, src register (read through its %2d form), pshufb mask (ssse3 only)
+%if cpuflag(ssse3)
+    movd %1, %2d
+    pshufb %1, %3
+%else
+    movd %1, %2d
+    punpcklbw %1, %1 ; lowest byte now fills word 0
+    SPLATW %1, %1, 0
+%endif
+%endmacro
+
+%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp (tmp is used only without ssse3)
+%if cpuflag(ssse3)
+%if %0==5
+    palignr %1, %2, %3, %4
+%else
+    palignr %1, %2, %3
+%endif
+%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
+    %define %%dst %1
+%if %0==5
+%ifnidn %1, %2
+    mova %%dst, %2
+%endif
+    %rotate 1 ; drop the explicit dst so that %2..%4 below are src2, imm, tmp
+%endif
+%ifnidn %4, %2
+    mova %4, %2
+%endif
+%if mmsize==8
+    psllq %%dst, (8-%3)*8
+    psrlq %4, %3*8
+%else
+    pslldq %%dst, 16-%3
+    psrldq %4, %3
+%endif
+    por %%dst, %4
+%endif
+%endmacro
+
+%macro PAVGB 2 ; dst, src: pavgb or pavgusb; emits nothing if neither mmxext nor 3dnow is set
+%if cpuflag(mmxext)
+    pavgb %1, %2
+%elif cpuflag(3dnow)
+    pavgusb %1, %2
+%endif
+%endmacro
+
+%macro PSHUFLW 1+ ; operand list is forwarded unchanged to pshufw (mmsize == 8) or pshuflw
+    %if mmsize == 8
+        pshufw %1
+    %else
+        pshuflw %1
+    %endif
+%endmacro
+
+%macro PSWAPD 2 ; dst, src: emits nothing if none of mmxext/3dnowext/3dnow is set
+%if cpuflag(mmxext)
+    pshufw %1, %2, q1032
+%elif cpuflag(3dnowext)
+    pswapd %1, %2
+%elif cpuflag(3dnow)
+    movq %1, %2
+    psrlq %1, 32 ; low dword of dst = high dword of src
+    punpckldq %1, %2 ; high dword of dst = low dword of src
+%endif
+%endmacro
+
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+%ifnum %5
+    pand m%3, m%5, m%4 ; src .. y6 .. y4
+    pand m%1, m%5, m%2 ; dst .. y6 .. y4
+%else
+    mova m%1, %5
+    pand m%3, m%1, m%4 ; src .. y6 .. y4
+    pand m%1, m%1, m%2 ; dst .. y6 .. y4
+%endif
+    psrlw m%2, 8 ; dst .. y7 .. y5
+    psrlw m%4, 8 ; src .. y7 .. y5
+%endmacro
+
+%macro SUMSUB_BA 3-4 ; size suffix, a, b [, tmp]: a = a+b, b = b-a (using the old a)
+%if %0==3
+    padd%1 m%2, m%3 ; a = a+b
+    padd%1 m%3, m%3 ; b = 2b
+    psub%1 m%3, m%2 ; b = 2b-(a+b) = b-a
+%else
+%if avx_enabled == 0
+    mova m%4, m%2
+    padd%1 m%2, m%3
+    psub%1 m%3, m%4
+%else
+    padd%1 m%4, m%2, m%3
+    psub%1 m%3, m%2
+    SWAP %2, %4
+%endif
+%endif
+%endmacro
+
+%macro SUMSUB_BADC 5-6 ; size suffix, a, b, c, d [, tmp]: SUMSUB_BA on (a, b) and on (c, d)
+%if %0==6
+    SUMSUB_BA %1, %2, %3, %6
+    SUMSUB_BA %1, %4, %5, %6
+%else
+    padd%1 m%2, m%3
+    padd%1 m%4, m%5
+    padd%1 m%3, m%3
+    padd%1 m%5, m%5
+    psub%1 m%3, m%2
+    psub%1 m%5, m%4
+%endif
+%endmacro
+
+%macro SUMSUB2_AB 4 ; size suffix, a, b, tmp: a = 2a+b, tmp = a-2b; b is a reg number or a raw operand
+%ifnum %3
+    psub%1 m%4, m%2, m%3
+    psub%1 m%4, m%3
+    padd%1 m%2, m%2
+    padd%1 m%2, m%3
+%else
+    mova m%4, m%2
+    padd%1 m%2, m%2
+    padd%1 m%2, %3
+    psub%1 m%4, %3
+    psub%1 m%4, %3
+%endif
+%endmacro
+
+%macro SUMSUB2_BA 4 ; size suffix, a, b, tmp: a = a+2b, b = b-2a (using the old a)
+%if avx_enabled == 0
+    mova m%4, m%2
+    padd%1 m%2, m%3
+    padd%1 m%2, m%3
+    psub%1 m%3, m%4
+    psub%1 m%3, m%4
+%else
+    padd%1 m%4, m%2, m%3
+    padd%1 m%4, m%3
+    psub%1 m%3, m%2
+    psub%1 m%3, m%2
+    SWAP %2, %4
+%endif
+%endmacro
+
+%macro SUMSUBD2_AB 5 ; size suffix, a, b, tmp0, tmp1: a = (a>>1)-b, b = (b>>1)+a (old values on the right)
+%ifnum %4
+    psra%1 m%5, m%2, 1 ; %5: %2>>1
+    psra%1 m%4, m%3, 1 ; %4: %3>>1
+    padd%1 m%4, m%2 ; %4: %3>>1+%2
+    psub%1 m%5, m%3 ; %5: %2>>1-%3
+    SWAP %2, %5 ; results were built in the temps, so exchange the register names
+    SWAP %3, %4
+%else
+    mova %5, m%2
+    mova %4, m%3
+    psra%1 m%3, 1 ; %3: %3>>1
+    psra%1 m%2, 1 ; %2: %2>>1
+    padd%1 m%3, %5 ; %3: %3>>1+%2
+    psub%1 m%2, %4 ; %2: %2>>1-%3
+%endif
+%endmacro
+
+%macro DCT4_1D 5 ; 4 row reg numbers, then either a tmp reg number or an address used as spill space
+%ifnum %5
+    SUMSUB_BADC w, %4, %1, %3, %2, %5
+    SUMSUB_BA w, %3, %4, %5
+    SUMSUB2_AB w, %1, %2, %5
+    SWAP %1, %3, %4, %5, %2
+%else
+    SUMSUB_BADC w, %4, %1, %3, %2
+    SUMSUB_BA w, %3, %4
+    mova [%5], m%2 ; spill m%2 so that its register can serve as SUMSUB2_AB's tmp
+    SUMSUB2_AB w, %1, [%5], %2
+    SWAP %1, %3, %4, %2
+%endif
+%endmacro
+
+%macro IDCT4_1D 6-7 ; size suffix, 4 row reg numbers, tmp reg number or address [, second tmp reg number]
+%ifnum %6
+    SUMSUBD2_AB %1, %3, %5, %7, %6
+    ; %3: %3>>1-%5 %5: %3+%5>>1
+    SUMSUB_BA %1, %4, %2, %7
+    ; %4: %2+%4 %2: %2-%4
+    SUMSUB_BADC %1, %5, %4, %3, %2, %7
+    ; %5: %2+%4 + (%3+%5>>1)
+    ; %4: %2+%4 - (%3+%5>>1)
+    ; %3: %2-%4 + (%3>>1-%5)
+    ; %2: %2-%4 - (%3>>1-%5)
+%else
+%ifidn %1, w
+    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
+%else
+    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
+%endif
+    SUMSUB_BA %1, %4, %2
+    SUMSUB_BADC %1, %5, %4, %3, %2
+%endif
+    SWAP %2, %5, %4
+    ; %2: %2+%4 + (%3+%5>>1) row0
+    ; %3: %2-%4 + (%3>>1-%5) row1
+    ; %4: %2-%4 - (%3>>1-%5) row2
+    ; %5: %2+%4 - (%3+%5>>1) row3
+%endmacro
+
+
+%macro LOAD_DIFF 5 ; dst, tmp, unpack reg (presumably zeroed) or "none", src1, src2: dst = src1 - src2 as words
+%ifidn %3, none
+    movh %1, %4
+    movh %2, %5
+    punpcklbw %1, %2 ; words = src1 + 256*src2
+    punpcklbw %2, %2 ; words = src2 + 256*src2
+    psubw %1, %2 ; the 256*src2 terms cancel
+%else
+    movh %1, %4
+    punpcklbw %1, %3
+    movh %2, %5
+    punpcklbw %2, %3
+    psubw %1, %2
+%endif
+%endmacro
+
+%macro STORE_DCT 6 ; 4 reg numbers, base, offset: low qwords go to +0..+24, high qwords to +32..+56
+    movq [%5+%6+ 0], m%1
+    movq [%5+%6+ 8], m%2
+    movq [%5+%6+16], m%3
+    movq [%5+%6+24], m%4
+    movhps [%5+%6+32], m%1
+    movhps [%5+%6+40], m%2
+    movhps [%5+%6+48], m%3
+    movhps [%5+%6+56], m%4
+%endmacro
+
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 1x reg forwarded as LOAD_DIFF's 3rd argument, 2x pointer, increment?
+    LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5]
+%if %10
+    lea %8, [%8+4*r1]
+    lea %9, [%9+4*r3]
+%endif
+%endmacro
+
+%macro DIFFx2 6-7 ; res1, res2, tmp, unpack reg (presumably zeroed), src1, src2; %7 is not referenced
+    movh %3, %5
+    punpcklbw %3, %4
+    psraw %1, 6
+    paddsw %1, %3 ; res1 = (res1>>6) + src1
+    movh %3, %6
+    punpcklbw %3, %4
+    psraw %2, 6
+    paddsw %2, %3 ; res2 = (res2>>6) + src2
+    packuswb %2, %1 ; both results end up packed in res2
+%endmacro
+
+%macro STORE_DIFF 4 ; res, tmp, unpack reg (presumably zeroed), dst memory operand (read, then written)
+    movh %2, %4
+    punpcklbw %2, %3
+    psraw %1, 6
+    paddsw %1, %2 ; res = (res>>6) + dst
+    packuswb %1, %1
+    movh %4, %1
+%endmacro
+
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movh %3, [%7]
+    movh %4, [%7+%8]
+    psraw %1, %6
+    psraw %2, %6
+    punpcklbw %3, %5
+    punpcklbw %4, %5
+    paddw %3, %1
+    paddw %4, %2
+    packuswb %3, %5
+    packuswb %4, %5
+    movh [%7], %3
+    movh [%7+%8], %4
+%endmacro
+
+%macro PMINUB 3 ; dst, src, ignored
+%if cpuflag(mmxext)
+    pminub %1, %2
+%else ; dst, src, tmp
+    mova %3, %1
+    psubusb %3, %2 ; tmp = saturating dst-src, i.e. 0 where dst <= src
+    psubb %1, %3 ; dst = dst - tmp
+%endif
+%endmacro
+
+%macro SPLATW 2-3 0 ; dst, src, word index (default 0)
+%if mmsize == 16
+    pshuflw %1, %2, (%3)*0x55
+    punpcklqdq %1, %1
+%elif cpuflag(mmxext)
+    pshufw %1, %2, (%3)*0x55
+%else
+    %ifnidn %1, %2
+        mova %1, %2
+    %endif
+    %if %3 & 2
+        punpckhwd %1, %1
+    %else
+        punpcklwd %1, %1
+    %endif
+    %if %3 & 1
+        punpckhwd %1, %1
+    %else
+        punpcklwd %1, %1
+    %endif
+%endif
+%endmacro
+
+%macro SPLATD 1 ; reg, modified in place; emits nothing if mmsize != 8 and neither sse2 nor sse is set
+%if mmsize == 8
+    punpckldq %1, %1
+%elif cpuflag(sse2)
+    pshufd %1, %1, 0
+%elif cpuflag(sse)
+    shufps %1, %1, 0
+%endif
+%endmacro
+
+%macro CLIPW 3 ;(dst, min, max)
+    pmaxsw %1, %2
+    pminsw %1, %3
+%endmacro
+
+%macro PMINSD_MMX 3 ; dst, src, tmp
+    mova %3, %2
+    pcmpgtd %3, %1 ; tmp = all-ones in dwords where src > dst
+    pxor %1, %2
+    pand %1, %3
+    pxor %1, %2 ; dst where the mask is set, src elsewhere
+%endmacro
+
+%macro PMAXSD_MMX 3 ; dst, src, tmp
+    mova %3, %1
+    pcmpgtd %3, %2 ; tmp = all-ones in dwords where dst > src
+    pand %1, %3
+    pandn %3, %2
+    por %1, %3 ; dst where the mask is set, src elsewhere
+%endmacro
+
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
+    PMINSD_MMX %1, %3, %4
+    PMAXSD_MMX %1, %2, %4
+%endmacro
+
+%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+    cvtdq2ps %1, %1
+    minps %1, %3
+    maxps %1, %2
+    cvtps2dq %1, %1
+%endmacro
+
+%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
+    pminsd %1, %3
+    pmaxsd %1, %2
+%endmacro
+
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
+%if cpuflag(avx)
+    vbroadcastss %1, %2
+%else ; sse
+    movss %1, %2
+    shufps %1, %1, 0
+%endif
+%endmacro
+
+%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
+%if cpuflag(avx) && mmsize == 32
+    vbroadcastsd %1, %2
+%elif cpuflag(sse3)
+    movddup %1, %2
+%else ; sse2
+    movsd %1, %2
+    movlhps %1, %1
+%endif
+%endmacro
+
+%macro SHUFFLE_MASK_W 8 ; 8 values -> 16 db bytes: v >= 0x80 is emitted twice as-is, otherwise as 2v, 2v+1
+    %rep 8
+        %if %1>=0x80
+            db %1, %1
+        %else
+            db %1*2
+            db %1*2+1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro PMOVSXWD 2; dst, src
+%if cpuflag(sse4)
+    pmovsxwd %1, %2
+%else
+    %ifnidn %1, %2
+        mova %1, %2
+    %endif
+    punpcklwd %1, %1 ; each low word duplicated into both halves of a dword
+    psrad %1, 16 ; arithmetic shift leaves the sign-extended word
+%endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddps
+%macro FMULADD_PS 5 ; dst, src1, src2, src3, tmp: dst = src1*src2 + src3; tmp is used only without FMA when dst is src3
+    %if cpuflag(fma3) || cpuflag(fma4)
+        fmaddps %1, %2, %3, %4
+    %elifidn %1, %4
+        mulps %5, %2, %3
+        addps %1, %4, %5
+    %else
+        mulps %1, %2, %3
+        addps %1, %4
+    %endif
+%endmacro
+
+; Wrapper for non-FMA version of fmaddpd
+%macro FMULADD_PD 5 ; dst, src1, src2, src3, tmp: dst = src1*src2 + src3; tmp is used only without FMA when dst is src3
+    %if cpuflag(fma3) || cpuflag(fma4)
+        fmaddpd %1, %2, %3, %4
+    %elifidn %1, %4
+        mulpd %5, %2, %3
+        addpd %1, %4, %5
+    %else
+        mulpd %1, %2, %3
+        addpd %1, %4
+    %endif
+%endmacro |