From 334e82a0a23db8a5c0816756021611bfffe2fa26 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Thu, 12 Feb 2015 15:43:26 +0100
Subject: Vectorize c and post matrices as well as apply_xform

---
 flam3.c         | 25 +++++++++--------------
 flam3.h         |  8 ++++----
 interpolation.c | 27 ++++++++++---------------
 interpolation.h |  8 ++++----
 variations.c    | 63 ++++++++++++++++++++++++++++-----------------------------
 variations.h    |  2 +-
 vector.h        |  5 +++++
 wscript         |  2 +-
 8 files changed, 66 insertions(+), 74 deletions(-)
 create mode 100644 vector.h
diff --git a/flam3.c b/flam3.c
index 928aed0..6eaedcb 100644
--- a/flam3.c
+++ b/flam3.c
@@ -229,16 +229,13 @@ int flam3_create_chaos_distrib(flam3_genome *cp, int xi, unsigned short *xform_d
 
 int flam3_iterate(flam3_genome *cp, int n, int fuse,  double *samples, unsigned short *xform_distrib, randctx *rc) {
    int i;
-   double p[4], q[4];
+   double4 p, q;
    int consec = 0;
    int badvals = 0;
    int lastxf=0;
    int fn;
    
-   p[0] = samples[0];
-   p[1] = samples[1];
-   p[2] = samples[2];
-   p[3] = samples[3];
+   p = (double4) { samples[0], samples[1], samples[2], samples[3] };
 
    /* Perform precalculations */   
    for (i=0;i<cp->num_xforms;i++)
@@ -252,14 +249,11 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse,  double *samples, unsigned
       else
          fn = xform_distrib[ xorshift_step(rc) & CHOOSE_XFORM_GRAIN_M1 ];
       
-      if (apply_xform(cp, fn, p, q, rc)>0) {
+      if (apply_xform(cp, fn, p, &q, rc)>0) {
          consec ++;
          badvals ++;
          if (consec<5) {
-            p[0] = q[0];
-            p[1] = q[1];
-            p[2] = q[2];
-            p[3] = q[3];
+			p = q;
             i -= 4;
             continue;
          } else
@@ -270,17 +264,14 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse,  double *samples, unsigned
       /* Store the last used transform */
       lastxf = fn+1;
 
-      p[0] = q[0];
-      p[1] = q[1];
-      p[2] = q[2];
-      p[3] = q[3];
+	  p = q;
 
       if (cp->final_xform_enable == 1) {
          if (cp->xform[cp->final_xform_index].opacity==1 || 
                 flam3_random_isaac_01(rc)<cp->xform[cp->final_xform_index].opacity) {
-             apply_xform(cp, cp->final_xform_index, p, q, rc);
+             apply_xform(cp, cp->final_xform_index, p, &q, rc);
              /* Keep the opacity from the original xform */
-             q[3] = p[3];
+			 q = (double4) { q[0], q[1], q[2], p[3] };
          }
       }
 
@@ -296,6 +287,7 @@ int flam3_iterate(flam3_genome *cp, int n, int fuse,  double *samples, unsigned
    return(badvals);
 }
 
+#if 0
 int flam3_xform_preview(flam3_genome *cp, int xi, double range, int numvals, int depth, double *result, randctx *rc) {
 
    /* We will evaluate the 'xi'th xform 'depth' times, over the following values:           */
@@ -345,6 +337,7 @@ int flam3_xform_preview(flam3_genome *cp, int xi, double range, int numvals, int
 
    return(0);
 }         
+#endif
 
 int flam3_colorhist(flam3_genome *cp, int num_batches, randctx *rc, double *hist) {
 
diff --git a/flam3.h b/flam3.h
index a66e40e..4135701 100644
--- a/flam3.h
+++ b/flam3.h
@@ -207,11 +207,12 @@ typedef struct {
 
 } flam3_image_store;
 
+#include "vector.h"
 
 typedef struct xform {
    double var[flam3_nvariations];   /* interp coefs between variations */
-   double c[3][2];      /* the coefs to the affine part of the function */
-   double post[3][2];   /* the post transform */
+   double2 c[3];      /* the coefs to the affine part of the function */
+   double2 post[3];   /* the post transform */
    double density;      /* probability that this function is chosen. 0 - 1 */
    double color;     /* color coords for this function. 0 - 1 */
    double color_speed;  /* scaling factor on color added to current iteration */
@@ -420,8 +421,7 @@ typedef struct xform {
    double radialBlur_zoomvar;
 
    /* Precalculate these values for waves */
-   double waves_dx2;
-   double waves_dy2;
+   double2 waves_d2;
 
    /* If disc2 is used, precalculate these values */
    double disc2_sinadd;
diff --git a/interpolation.c b/interpolation.c
index 0ab1c54..4bef70f 100644
--- a/interpolation.c
+++ b/interpolation.c
@@ -57,7 +57,7 @@ double det_matrix(double s[2][2]) {
    return s[0][0] * s[1][1] - s[0][1] * s[1][0];
 }
 
-int id_matrix(double s[3][2]) {
+int id_matrix(double2 s[3]) {
    return
       (s[0][0] == 1.0) &&
       (s[0][1] == 0.0) &&
@@ -67,7 +67,7 @@ int id_matrix(double s[3][2]) {
       (s[2][1] == 0.0);
 }
 
-int zero_matrix(double s[3][2]) {
+int zero_matrix(double2 s[3]) {
    return
       (s[0][0] == 0.0) &&
       (s[0][1] == 0.0) &&
@@ -88,23 +88,18 @@ void copy_matrix(double to[3][2], double from[3][2]) {
 }
 
 
-void clear_matrix(double m[3][2]) {
-   m[0][0] = 0.0;
-   m[0][1] = 0.0;
-   m[1][0] = 0.0;
-   m[1][1] = 0.0;
-   m[2][0] = 0.0;
-   m[2][1] = 0.0;
+void clear_matrix(double2 m[3]) {   /* set all three rows of m to (0, 0) */
+   const double2 zero = (double2) { 0.0, 0.0 };
+   m[0] = zero;
+   m[1] = zero;
+   m[2] = zero;
 }
 
-void sum_matrix(double s, double m1[3][2], double m2[3][2]) {
+void sum_matrix(double s, const double2 m1[3], double2 m2[3]) {   /* accumulate m2 += s * m1, one double2 row at a time */
 
-   m2[0][0] += s * m1[0][0];
-   m2[0][1] += s * m1[0][1];
-   m2[1][0] += s * m1[1][0];
-   m2[1][1] += s * m1[1][1];
-   m2[2][0] += s * m1[2][0];
-   m2[2][1] += s * m1[2][1];
+   m2[0] += s * m1[0];
+   m2[1] += s * m1[1];
+   m2[2] += s * m1[2];
 }
 
 void mult_matrix(double s1[2][2], double s2[2][2], double d[2][2]) {
diff --git a/interpolation.h b/interpolation.h
index 82d3133..da3dfd1 100644
--- a/interpolation.h
+++ b/interpolation.h
@@ -35,11 +35,11 @@ double smoother(double t);
 double get_stagger_coef(double t, double stagger_prc, int num_xforms, int this_xform);
 
 double det_matrix(double s[2][2]);
-int id_matrix(double s[3][2]);
-int zero_matrix(double s[3][2]);
+int id_matrix(double2 s[3]);
+int zero_matrix(double2 s[3]);
 void copy_matrix(double to[3][2], double from[3][2]);
-void clear_matrix(double m[3][2]);
-void sum_matrix(double s, double m1[3][2], double m2[3][2]);
+void clear_matrix(double2 m[3]);
+void sum_matrix(double s, const double2 m1[3], double2 m2[3]);
 void mult_matrix(double s1[2][2], double s2[2][2], double d[2][2]);
 
 int compare_xforms(const void *av, const void *bv);
diff --git a/variations.c b/variations.c
index 27bd52c..fd7eddb 100644
--- a/variations.c
+++ b/variations.c
@@ -37,8 +37,6 @@ extern void sincos(double x, double *s, double *c);
 #define trunc (int)
 #endif
 
-typedef double double2 __attribute__ ((vector_size (sizeof (double)*2)));
-
 typedef struct {
    double precalc_atan, precalc_sina;  /* Precalculated, if needed */
    double precalc_cosa, precalc_sqrt;
@@ -405,12 +403,11 @@ static double2 var15_waves (const double2 in, const flam3_iter_helper * const f,
       p[0] += v * nx;
       p[1] += v * ny; */
 
-   const double2 c1 = (double2) {f->xform->c[1][0], f->xform->c[1][1] };
+   const double2 c1 = f->xform->c[1];
+   const double2 inswap = (double2) { in[1], in[0] };
+   const double2 a = inswap * f->xform->waves_d2;
 
-   const double2 n = in + c1 * (double2) {
-                                 sin( in[1] * f->xform->waves_dx2 ),
-                                 sin( in[0] * f->xform->waves_dy2 ),
-								 };
+   const double2 n = in + c1 * (double2) { sin(a[0]), sin(a[1]), };
 
    return weight * n;
 }
@@ -447,10 +444,7 @@ static double2 var17_popcorn (const double2 in, const flam3_iter_helper * const
    const double dx = tan(3.0*in[1]);
    const double dy = tan(3.0*in[0]);
 
-   const double2 n = in + (double2) {
-                            f->xform->c[2][0] * sin(dx),
-							f->xform->c[2][1] * sin(dy)
-                            };
+   const double2 n = in + f->xform->c[2] * (double2) { sin(dx), sin(dy) };
 
    return weight * n;
 }
@@ -1893,11 +1887,9 @@ static void radial_blur_precalc(flam3_xform *xf) {
 }
 
 static void waves_precalc(flam3_xform *xf) {
-   double dx = xf->c[2][0];
-   double dy = xf->c[2][1];
+   const double2 d = xf->c[2];   /* both components of c[2] (formerly dx, dy) */
 
-   xf->waves_dx2 = 1.0/(dx * dx + EPS);
-   xf->waves_dy2 = 1.0/(dy * dy + EPS);
+   xf->waves_d2 = 1.0/(d * d + EPS);   /* lane-wise: { 1/(d[0]^2 + EPS), 1/(d[1]^2 + EPS) } */
 }
 
 static void disc2_precalc(flam3_xform *xf) {
@@ -2051,8 +2043,17 @@ int prepare_precalc_flags(flam3_genome *cp) {
    return(0);
 }
 
+/*	Apply an affine transformation: in[0]*matrix[0] + in[1]*matrix[1] + matrix[2].
+ *	in[0] and in[1] act as scalars on whole rows, so both output lanes are computed together. */
+static double2 apply_affine (const double2 in, const double2 matrix[3]) {
+	return matrix[0] * in[0] + matrix[1] * in[1] + matrix[2];
+}
+
+static double sum(const double2 in) {	/* returns the sum of the two lanes of in */
+	return in[0] + in[1];
+}
 
-int apply_xform(flam3_genome *cp, int fn, double *p, double *q, randctx *rc)
+int apply_xform(flam3_genome *cp, int fn, const double4 p, double4 *q_ret, randctx *rc)
 {
    flam3_iter_helper f;
    int var_n;
@@ -2063,22 +2064,21 @@ int apply_xform(flam3_genome *cp, int fn, double *p, double *q, randctx *rc)
 
    s1 = cp->xform[fn].color_speed;
 
-   q[2] = s1 * cp->xform[fn].color + (1.0-s1) * p[2];
-   q[3] = cp->xform[fn].vis_adjusted;
+   const double2 q23 = (double2) {
+         s1 * cp->xform[fn].color + (1.0-s1) * p[2],
+         cp->xform[fn].vis_adjusted,
+		 };
 
    //fprintf(stderr,"%d : %f %f %f\n",fn,cp->xform[fn].c[0][0],cp->xform[fn].c[1][0],cp->xform[fn].c[2][0]);
 
-   const double2 t = (double2) {
-         cp->xform[fn].c[0][0] * p[0] + cp->xform[fn].c[1][0] * p[1] + cp->xform[fn].c[2][0],
-         cp->xform[fn].c[0][1] * p[0] + cp->xform[fn].c[1][1] * p[1] + cp->xform[fn].c[2][1]
-		 };
+   const double2 t = apply_affine ((double2) { p[0], p[1] }, cp->xform[fn].c);
 
    /* Pre-xforms go here, and modify the f.tx and f.ty values */
    if (cp->xform[fn].has_preblur!=0.0)
       var67_pre_blur(t, &f, cp->xform[fn].has_preblur);
 
    /* Always calculate sumsq and sqrt */
-   f.precalc_sumsq = t[0]*t[0] + t[1]*t[1];
+   f.precalc_sumsq = sum(t*t);
    f.precalc_sqrt = sqrt(f.precalc_sumsq);
 
    /* Check to see if we can precalculate any parts */
@@ -2099,7 +2099,6 @@ int apply_xform(flam3_genome *cp, int fn, double *p, double *q, randctx *rc)
 
    f.xform = &(cp->xform[fn]);
 
-   
    double2 accum = (double2) {0.0, 0.0};
    for (var_n=0; var_n < cp->xform[fn].num_active_vars; var_n++) {
       
@@ -2306,22 +2305,22 @@ int apply_xform(flam3_genome *cp, int fn, double *p, double *q, randctx *rc)
       }
 
    }
+   double2 q01;
    /* apply the post transform */
    if (cp->xform[fn].has_post) {
-      q[0] = cp->xform[fn].post[0][0] * accum[0] + cp->xform[fn].post[1][0] * accum[1] + cp->xform[fn].post[2][0];
-      q[1] = cp->xform[fn].post[0][1] * accum[0] + cp->xform[fn].post[1][1] * accum[1] + cp->xform[fn].post[2][1];
+      q01 = apply_affine (accum, cp->xform[fn].post);
    } else {
-      q[0] = accum[0];
-      q[1] = accum[1];
+      q01 = accum;
    }
 
    /* Check for badvalues and return randoms if bad */
-   if (badvalue(q[0]) || badvalue(q[1])) {
-      q[0] = flam3_random_isaac_11(rc);
-      q[1] = flam3_random_isaac_11(rc);
+   if (badvalue(q01[0]) || badvalue(q01[1])) {
+      *q_ret = (double4) { flam3_random_isaac_11(rc), flam3_random_isaac_11(rc), q23[0], q23[1] };
       return(1);
-   } else
+   } else {
+	  *q_ret = (double4) { q01[0], q01[1], q23[0], q23[1] };
       return(0);
+   }
 
 }
 
diff --git a/variations.h b/variations.h
index c606484..bc5f29e 100644
--- a/variations.h
+++ b/variations.h
@@ -24,6 +24,6 @@
 void xform_precalc(flam3_genome *cp, int xi);
 int prepare_precalc_flags(flam3_genome *);
 
-int apply_xform(flam3_genome *cp, int fn, double *p, double *q, randctx *rc);
+int apply_xform(flam3_genome *cp, int fn, const double4 p, double4 *, randctx *rc);
 void initialize_xforms(flam3_genome *thiscp, int start_here);
 #endif
diff --git a/vector.h b/vector.h
new file mode 100644
index 0000000..7633c15
--- /dev/null
+++ b/vector.h
@@ -0,0 +1,5 @@
+#pragma once
+
+typedef double double2 __attribute__ ((vector_size (sizeof (double)*2))); /* vector of 2 doubles (compiler vector extension) */
+typedef double double4 __attribute__ ((vector_size (sizeof (double)*4))); /* vector of 4 doubles (compiler vector extension) */
+
diff --git a/wscript b/wscript
index 68b76f7..07c9deb 100644
--- a/wscript
+++ b/wscript
@@ -26,5 +26,5 @@ def build(bld):
     bld.program (features='c cprogram', source='flam3-render.c', target='flam3-render', use='libflam3 xml2 jpeg png amdlibm pthread', includes='.')
     bld.program (features='c cprogram', source='flam3-genome.c', target='flam3-genome', use='libflam3 xml2 png amdlibm pthread', includes='.')
     bld.program (features='c cprogram', source='flam3-animate.c', target='flam3-animate', use='libflam3 xml2 png amdlibm pthread', includes='.')
-    bld.program (features='c cprogram', source='flam3-convert.c', target='flam3-convert', use='libflam3 xml2 png amdlibm pthread', includes='.')
+    #bld.program (features='c cprogram', source='flam3-convert.c', target='flam3-convert', use='libflam3 xml2 png amdlibm pthread', includes='.')
 
-- 
cgit v1.2.3