From cb40d04fa6c5d6105adc42278b13954ee003d0e9 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Thu, 12 Feb 2015 16:36:05 +0100 Subject: Vectorize flam3_iterate --- flam3.c | 76 +++++++++++++++++++++++++++++++-------------------------------- flam3.h | 3 +-- private.h | 2 +- rect.c | 22 +++++++++--------- 4 files changed, 51 insertions(+), 52 deletions(-) diff --git a/flam3.c b/flam3.c index 6eaedcb..2b4976f 100644 --- a/flam3.c +++ b/flam3.c @@ -227,7 +227,7 @@ int flam3_create_chaos_distrib(flam3_genome *cp, int xi, unsigned short *xform_d */ -int flam3_iterate(flam3_genome *cp, int n, int fuse, double *samples, unsigned short *xform_distrib, randctx *rc) { +int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, unsigned short *xform_distrib, randctx *rc) { int i; double4 p, q; int consec = 0; @@ -235,13 +235,13 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse, double *samples, unsigned int lastxf=0; int fn; - p = (double4) { samples[0], samples[1], samples[2], samples[3] }; + p = in; /* Perform precalculations */ for (i=0;inum_xforms;i++) xform_precalc(cp,i); - for (i = -4*fuse; i < 4*n; i+=4) { + for (i = fuse; i < n; i++) { // fn = xform_distrib[ lastxf*CHOOSE_XFORM_GRAIN + (((unsigned)irand(rc)) % CHOOSE_XFORM_GRAIN)]; if (cp->chaos_enable) @@ -254,7 +254,7 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse, double *samples, unsigned badvals ++; if (consec<5) { p = q; - i -= 4; + --i; continue; } else consec = 0; @@ -277,10 +277,7 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse, double *samples, unsigned /* if fuse over, store it */ if (i >= 0) { - samples[i] = q[0]; - samples[i+1] = q[1]; - samples[i+2] = q[2]; - samples[i+3] = q[3]; + samples[i] = q; } } @@ -339,13 +336,16 @@ int flam3_xform_preview(flam3_genome *cp, int xi, double range, int numvals, int } #endif +#if 0 int flam3_colorhist(flam3_genome *cp, int num_batches, randctx *rc, double *hist) { int lp,plp; int mycolor; unsigned short *xform_distrib; int sbs = 10000; - double sub_batch[4*10000]; + double4 *sub_batch; + + sub_batch = malloc (sizeof (*sub_batch) * sbs); memset(hist,0,256*sizeof(double)); @@ -357,15 +357,12 @@ int flam3_colorhist(flam3_genome *cp, int num_batches, randctx *rc, double *hist for (lp=0;lp b[0]) return 1; return 0; } static int sort_by_y(const void *av, const void *bv) { - double *a = (double *) av; - double *b = (double *) bv; + double4 a = *((double4 *) av); + double4 b = *((double4 *) bv); if (a[1] < b[1]) return -1; if (a[1] > b[1]) return 1; return 0; @@ -3527,24 +3526,21 @@ int flam3_estimate_bounding_box(flam3_genome *cp, double eps, int nsamples, int i; int low_target, high_target; double min[2], max[2]; - double *points; + double4 *points; int bv; unsigned short *xform_distrib; if (nsamples <= 0) nsamples = 10000; - points = (double *) malloc(sizeof(double) * 4 * nsamples); - points[0] = flam3_random_isaac_11(rc); - points[1] = flam3_random_isaac_11(rc); - points[2] = 0.0; - points[3] = 0.0; + points = (double4 *) malloc(sizeof(double4) * nsamples); + const double4 start = (double4) { flam3_random_isaac_11(rc), flam3_random_isaac_11(rc), 0.0, 0.0 }; if (prepare_precalc_flags(cp)) return(-1); xform_distrib = flam3_create_xform_distrib(cp); if (xform_distrib==NULL) return(-1); - bv=flam3_iterate(cp, nsamples, 20, points, xform_distrib, rc); + bv=flam3_iterate(cp, nsamples, 20, start, points, xform_distrib, rc); free(xform_distrib); if ( bv/(double)nsamples > eps ) @@ -3561,7 +3557,7 @@ int flam3_estimate_bounding_box(flam3_genome *cp, double eps, int nsamples, max[0] = max[1] = -1e10; for (i = 0; i < nsamples; i++) { - double *p = &points[4*i]; + const double4 p = points[i]; if (p[0] < min[0]) min[0] = p[0]; if (p[1] < min[1]) min[1] = p[1]; if (p[0] > max[0]) max[0] = p[0]; @@ -3577,13 +3573,13 @@ int flam3_estimate_bounding_box(flam3_genome *cp, double eps, int nsamples, return(bv); } - qsort(points, nsamples, sizeof(double) * 4, sort_by_x); - bmin[0] = points[4 * low_target]; - bmax[0] = points[4 * high_target]; + qsort(points, nsamples, sizeof(double4), sort_by_x); + bmin[0] = points[low_target][0]; + bmax[0] = points[high_target][0]; - qsort(points, nsamples, sizeof(double) * 4, sort_by_y); - bmin[1] = points[4 * low_target + 1]; - bmax[1] = points[4 * high_target + 1]; + qsort(points, nsamples, sizeof(double4), sort_by_y); + bmin[1] = points[low_target][1]; + bmax[1] = points[high_target][1]; free(points); return(bv); @@ -3895,7 +3891,7 @@ void flam3_srandom() { srandom(seed); } - +#if 0 /* correlation dimension, after clint sprott. computes slope of the correlation sum at a size scale the order of 2% the size of the attractor or the camera. */ @@ -3941,20 +3937,17 @@ double flam3_dimension(flam3_genome *cp, int ntries, int clip_to_camera) { got = 0; nclipped = 0; + double4 *subb = malloc (sizeof (*subb) * SBS); while (got < 2*ntries) { - double subb[40000]; int i4, clipped; unsigned short *xform_distrib; - subb[0] = flam3_random_isaac_11(&rc); - subb[1] = flam3_random_isaac_11(&rc); - subb[2] = 0.0; - subb[3] = 0.0; + const double4 start = (double4) { flam3_random_isaac_11(&rc), flam3_random_isaac_11(&rc), 0.0, 0.0 }; if (prepare_precalc_flags(cp)) return(-1.0); xform_distrib = flam3_create_xform_distrib(cp); if (xform_distrib==NULL) return(-1.0); - flam3_iterate(cp, SBS, 20, subb, xform_distrib, &rc); + flam3_iterate(cp, SBS, 20, start, subb, xform_distrib, &rc); free(xform_distrib); i4 = 0; for (i = 0; i < SBS; i++) { @@ -3973,12 +3966,14 @@ double flam3_dimension(flam3_genome *cp, int ntries, int clip_to_camera) { if (nclipped > 10 * ntries) { fprintf(stderr, "warning: too much clipping, " "flam3_dimension giving up.\n"); + free (subb); return sqrt(-1.0); } } i4 += 4; } } + free (subb); if (0) fprintf(stderr, "cliprate=%g\n", nclipped/(ntries+(double)nclipped)); @@ -4009,7 +4004,9 @@ double flam3_dimension(flam3_genome *cp, int ntries, int clip_to_camera) { free(hist); return fd; } +#endif +#if 0 double flam3_lyapunov(flam3_genome *cp, int ntries) { double p[4]; double x, y; @@ -4100,4 +4097,5 @@ double flam3_lyapunov(flam3_genome *cp, int ntries) { } return sum/(log(2.0)*ntries); } +#endif diff --git a/flam3.h b/flam3.h index 4135701..611cb22 100644 --- a/flam3.h +++ b/flam3.h @@ -549,8 +549,7 @@ void clear_cp(flam3_genome *cp, int def_flag); (samples[2], samples[3]) as starting color coordinate, perform fuse iterations and throw them away, then perform nsamples iterations and save them in the samples array */ -EXPORT int flam3_iterate(flam3_genome *g, int nsamples, int fuse, double *samples, - unsigned short *xform_distrib, randctx *rc); +EXPORT int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, unsigned short *xform_distrib, randctx *rc); void apply_motion_parameters(flam3_xform *xf, flam3_xform *addto, double blend); diff --git a/private.h b/private.h index f34599b..c33a099 100644 --- a/private.h +++ b/private.h @@ -102,7 +102,7 @@ typedef struct { typedef struct { - double *iter_storage; /* Storage for iteration coordinates */ + double4 *iter_storage; /* Storage for iteration coordinates */ randctx rc; /* Thread-unique ISAAC seed */ flam3_genome cp; /* Full copy of genome for use by the thread */ int first_thread; diff --git a/rect.c b/rect.c index 0b18993..30d76fd 100644 --- a/rect.c +++ b/rect.c @@ -390,13 +390,15 @@ static void iter_thread(void *fth) { } /* Seed iterations */ - fthp->iter_storage[0] = flam3_random_isaac_11(&(fthp->rc)); - fthp->iter_storage[1] = flam3_random_isaac_11(&(fthp->rc)); - fthp->iter_storage[2] = flam3_random_isaac_01(&(fthp->rc)); - fthp->iter_storage[3] = flam3_random_isaac_01(&(fthp->rc)); + const double4 start = (double4) { + flam3_random_isaac_11(&(fthp->rc)), + flam3_random_isaac_11(&(fthp->rc)), + flam3_random_isaac_01(&(fthp->rc)), + flam3_random_isaac_01(&(fthp->rc)), + }; /* Execute iterations */ - badcount = flam3_iterate(&(fthp->cp), sub_batch_size, fuse, fthp->iter_storage, ficp->xform_distrib, &(fthp->rc)); + badcount = flam3_iterate(&(fthp->cp), sub_batch_size, fuse, start, fthp->iter_storage, ficp->xform_distrib, &(fthp->rc)); #if defined(HAVE_LIBPTHREAD) && defined(USE_LOCKS) /* Lock mutex for access to accumulator */ @@ -407,12 +409,12 @@ static void iter_thread(void *fth) { ficp->badvals += badcount; /* Put them in the bucket accumulator */ - for (j = 0; j < sub_batch_size*4; j+=4) { + for (j = 0; j < sub_batch_size; j++) { double p0, p1, p00, p11; double dbl_index0,dbl_frac; double interpcolor[4]; int ci, color_index0; - double *p = &(fthp->iter_storage[j]); + const double4 p = fthp->iter_storage[j]; bucket *b; if (fthp->cp.rotate != 0.0) { @@ -532,7 +534,7 @@ static int render_rectangle(flam3_frame *spec, void *out, double nsamples, batch_size; bucket *buckets; abucket *accumulate; - double *points; + double4 *points; double *filter, *temporal_filter, *temporal_deltas, *batch_filter; double ppux=0, ppuy=0; int image_width, image_height; /* size of the image to produce */ @@ -698,7 +700,7 @@ static int render_rectangle(flam3_frame *spec, void *out, /* Just free buckets at the end */ buckets = (bucket *) last_block; accumulate = (abucket *) (last_block + sizeof(bucket) * nbuckets); - points = (double *) (last_block + (sizeof(bucket) + sizeof(abucket)) * nbuckets); + points = (double4 *) (last_block + (sizeof(bucket) + sizeof(abucket)) * nbuckets); if (verbose) { fprintf(stderr, "chaos: "); @@ -874,7 +876,7 @@ static int render_rectangle(flam3_frame *spec, void *out, fth[thi].timer_initialize = 0; } - fth[thi].iter_storage = &(points[thi*(spec->sub_batch_size)*4]); + fth[thi].iter_storage = &(points[thi*spec->sub_batch_size]); fth[thi].fic = &fic; flam3_copy(&(fth[thi].cp),&cp); -- cgit v1.2.3