From f92a757aae2bd379a60b8b17c7fec673a917701a Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 14 Feb 2015 14:46:44 +0100 Subject: Vectorize accumulation Also vectorizes some color functions and switches to double accumulation buffer. Does not seem to be slower. --- flam3.c | 35 --------------- flam3.h | 4 +- interpolation.c | 31 ++++++++------ palettes.c | 63 +++++++++++---------------- palettes.h | 7 +-- private.h | 2 +- rect.c | 131 ++++++++++++++++++++------------------------------------ wscript | 2 +- 8 files changed, 97 insertions(+), 178 deletions(-) diff --git a/flam3.c b/flam3.c index 2380c6f..4564b3f 100644 --- a/flam3.c +++ b/flam3.c @@ -3312,41 +3312,6 @@ int flam3_estimate_bounding_box(flam3_genome *cp, double eps, int nsamples, return(bv); } - -typedef double bucket_double[5]; -typedef double abucket_double[4]; -typedef unsigned int bucket_int[5]; -typedef unsigned int abucket_int[4]; -typedef float bucket_float[5]; -typedef float abucket_float[4]; - -/* experimental 32-bit datatypes (called 33) */ -#define bucket bucket_int -#define abucket abucket_float -#define abump_no_overflow(dest, delta) do {dest += delta;} while (0) -#define add_c_to_accum(acc,i,ii,j,jj,wid,hgt,c) do { \ - if ( (j) + (jj) >=0 && (j) + (jj) < (hgt) && (i) + (ii) >=0 && (i) + (ii) < (wid)) { \ - abucket *a = (acc) + ( (i) + (ii) ) + ( (j) + (jj) ) * (wid); \ - abump_no_overflow(a[0][0],(c)[0]); \ - abump_no_overflow(a[0][1],(c)[1]); \ - abump_no_overflow(a[0][2],(c)[2]); \ - abump_no_overflow(a[0][3],(c)[3]); \ - } \ -} while (0) -/* single-threaded */ -#define bump_no_overflow(dest, delta) do { \ - if (UINT_MAX - dest > delta) dest += delta; else dest = UINT_MAX; \ -} while (0) -#define iter_thread iter_thread_float -#include "rect.c" -#undef iter_thread -#undef render_rectangle -#undef add_c_to_accum -#undef bucket -#undef abucket -#undef bump_no_overflow -#undef abump_no_overflow - int flam3_render(flam3_frame *spec, void *out, int field, int nchan, int trans, stat_struct *stats) { diff --git a/flam3.h b/flam3.h index 0107672..a92c203 100644 --- a/flam3.h +++ b/flam3.h @@ -36,10 +36,12 @@ char *flam3_version(); #define flam3_print_edits (1) #define flam3_dont_print_edits (0) +#include "vector.h" + //typedef double flam3_palette[256][3]; typedef struct { double index; - double color[4]; + double4 color; } flam3_palette_entry; typedef flam3_palette_entry flam3_palette[256]; diff --git a/interpolation.c b/interpolation.c index 528e312..78b798f 100644 --- a/interpolation.c +++ b/interpolation.c @@ -156,23 +156,25 @@ void interpolate_cmap(flam3_palette cmap, double blend, fprintf(stderr,"unable to retrieve palette %d, setting to white\n", index1); for (i = 0; i < 256; i++) { - double t[5], s[5]; + double4 t, s; + double t4, s4; - rgb2hsv(p0[i].color, s); - rgb2hsv(p1[i].color, t); + s = rgb2hsv(p0[i].color); + t = rgb2hsv(p1[i].color); s[3] = p0[i].color[3]; t[3] = p1[i].color[3]; - s[4] = p0[i].index; - t[4] = p1[i].index; + s4 = p0[i].index; + t4 = p1[i].index; - for (j = 0; j < 5; j++) + for (j = 0; j < 4; j++) t[j] = ((1.0-blend) * s[j]) + (blend * t[j]); + t4 = ((1.0-blend) * s4) + (blend * t4); - hsv2rgb(t, cmap[i].color); + cmap[i].color = hsv2rgb(t); cmap[i].color[3] = t[3]; - cmap[i].index = t[4]; + cmap[i].index = t4; } } @@ -362,29 +364,30 @@ void flam3_interpolate_n(flam3_genome *result, int ncp, if (flam3_palette_interpolation_hsv == cpi[0].palette_interpolation) { for (i = 0; i < 256; i++) { - double t[3], s[5]; + double s4; + double4 s, t; int alpha1 = 1; - s[0] = s[1] = s[2] = s[3] = s[4] = 0.0; + s[0] = s[1] = s[2] = s[3] = s4 = 0.0; for (k = 0; k < ncp; k++) { - rgb2hsv(cpi[k].palette[i].color, t); + t = rgb2hsv(cpi[k].palette[i].color); for (j = 0; j < 3; j++) s[j] += c[k] * t[j]; s[3] += c[k] * cpi[k].palette[i].color[3]; if (cpi[k].palette[i].color[3] != 1.0) alpha1 = 0; - s[4] += c[k] * cpi[k].palette[i].index; + s4 += c[k] * cpi[k].palette[i].index; } if (alpha1 == 1) s[3] = 1.0; - hsv2rgb(s, result->palette[i].color); + result->palette[i].color = hsv2rgb(s); result->palette[i].color[3] = s[3]; - result->palette[i].index = s[4]; + result->palette[i].index = s4; for (j = 0; j < 4; j++) { if (result->palette[i].color[j] < 0.0) diff --git a/palettes.c b/palettes.c index 816f3a2..980a465 100644 --- a/palettes.c +++ b/palettes.c @@ -167,22 +167,24 @@ int flam3_get_palette(int n, flam3_palette c, double hue_rotation, randctx * con /* Loop over elements of cmap */ for (i = 0; i < cmap_len; i++) { int ii = (i * 256) / cmap_len; - double rgb[3], hsv[3]; + double4 rgb, hsv; - /* Colors are in 0-1 space */ - for (j = 0; j < 3; j++) - rgb[j] = the_palettes[idx].colors[ii][j] / 255.0; + rgb = (double4) { + the_palettes[idx].colors[ii][0] / 255.0, + the_palettes[idx].colors[ii][1] / 255.0, + the_palettes[idx].colors[ii][2] / 255.0, + 1.0, + }; - rgb2hsv(rgb, hsv); + hsv = rgb2hsv(rgb); hsv[0] += hue_rotation * 6.0; - hsv2rgb(hsv, rgb); + rgb = hsv2rgb(hsv); c[i].index = i; - for (j = 0; j < 3; j++) - c[i].color[j] = rgb[j]; + c[i].color = rgb; - c[i].color[3] = 1.0; + c[i].color[3] = 1.0; } return n; @@ -196,8 +198,7 @@ int flam3_get_palette(int n, flam3_palette c, double hue_rotation, randctx * con /* rgb 0 - 1, h 0 - 6, s 0 - 1, v 0 - 1 */ -void rgb2hsv(rgb, hsv) - double *rgb; double *hsv; +double4 rgb2hsv(double4 rgb) { double rd, gd, bd, h, s, v, max, min, del, rc, gc, bc; @@ -231,17 +232,13 @@ void rgb2hsv(rgb, hsv) if (h<0) h += 6; } - hsv[0] = h; - hsv[1] = s; - hsv[2] = v; + return (double4) { h, s, v, rgb[3] }; } /* h 0 - 6, s 0 - 1, v 0 - 1 rgb 0 - 1 */ -void hsv2rgb(hsv, rgb) - double *hsv; - double *rgb; +double4 hsv2rgb(double4 hsv) { double h = hsv[0], s = hsv[1], v = hsv[2]; int j; @@ -266,9 +263,7 @@ void hsv2rgb(hsv, rgb) default: rd = v; gd = t; bd = p; break; } - rgb[0] = rd; - rgb[1] = gd; - rgb[2] = bd; + return (double4) { rd, gd, bd, hsv[3] }; } double flam3_calc_alpha(double density, double gamma, double linrange) { @@ -289,19 +284,15 @@ double flam3_calc_alpha(double density, double gamma, double linrange) { return(alpha); } -void flam3_calc_newrgb(double *cbuf, double ls, double highpow, double *newrgb) { +double4 flam3_calc_newrgb(double4 cbuf, double ls, double highpow) { int rgbi; double newls,lsratio; - double newhsv[3]; double a, maxa=-1.0, maxc=0; double adjhlp; if (ls==0.0 || (cbuf[0]==0.0 && cbuf[1]==0.0 && cbuf[2]==0.0)) { - newrgb[0] = 0.0; - newrgb[1] = 0.0; - newrgb[2] = 0.0; - return; + return (double4) { 0, 0, 0, 0 }; } /* Identify the most saturated channel */ @@ -320,17 +311,13 @@ void flam3_calc_newrgb(double *cbuf, double ls, double highpow, double *newrgb) lsratio = pow(newls/ls,highpow); /* Calculate the max-value color (ranged 0 - 1) */ - for (rgbi=0;rgbi<3;rgbi++) - newrgb[rgbi] = newls*(cbuf[rgbi]/PREFILTER_WHITE)/255.0; + double4 newrgb = newls*(cbuf/PREFILTER_WHITE)/255.0; /* Reduce saturation by the lsratio */ - rgb2hsv(newrgb,newhsv); + double4 newhsv = rgb2hsv(newrgb); newhsv[1] *= lsratio; - hsv2rgb(newhsv,newrgb); - for (rgbi=0;rgbi<3;rgbi++) - newrgb[rgbi] *= 255.0; - + return hsv2rgb(newhsv) * 255.0; } else { newls = 255.0/maxc; adjhlp = -highpow; @@ -339,12 +326,10 @@ void flam3_calc_newrgb(double *cbuf, double ls, double highpow, double *newrgb) if (maxa<=255) adjhlp=1.0; - /* Calculate the max-value color (ranged 0 - 1) interpolated with the old behaviour */ - for (rgbi=0;rgbi<3;rgbi++) - newrgb[rgbi] = ((1.0-adjhlp)*newls + adjhlp*ls)*(cbuf[rgbi]/PREFILTER_WHITE); - -// for (rgbi=0;rgbi<3;rgbi++) -// newrgb[rgbi] = ls*(cbuf[rgbi]/PREFILTER_WHITE); + /* Calculate the max-value color (ranged 0 - 1) interpolated with the old + * behaviour */ + + return ((1.0-adjhlp)*newls + adjhlp*ls)*(cbuf/PREFILTER_WHITE); } } diff --git a/palettes.h b/palettes.h index 5d7b617..97b9e94 100644 --- a/palettes.h +++ b/palettes.h @@ -26,12 +26,13 @@ typedef struct { unsigned char colors[256][3]; } lib_palette; +#include "vector.h" -void rgb2hsv(double *rgb, double *hsv); -void hsv2rgb(double *hsv, double *rgb); +double4 rgb2hsv(double4); +double4 hsv2rgb(double4); double flam3_calc_alpha(double density, double gamma, double linrange); -void flam3_calc_newrgb(double *cbuf, double ls, double highpow, double *newrgb); +double4 flam3_calc_newrgb(double4 cbuf, double ls, double highpow); #endif diff --git a/private.h b/private.h index 62cdb71..ae026f5 100644 --- a/private.h +++ b/private.h @@ -62,7 +62,7 @@ typedef struct { double ws0, wb0s0, hs1, hb1s1; /* shortcuts for indexing */ flam3_palette_entry *dmap; /* palette */ double color_scalar; /* <1.0 if non-uniform motion blur is set */ - void *buckets; /* Points to the first accumulator */ + double4 *buckets; /* Points to the first accumulator */ double badvals; /* accumulates all badvalue resets */ double batch_size; int temporal_sample_num,ntemporal_samples; diff --git a/rect.c b/rect.c index 7a64cbd..7e45e16 100644 --- a/rect.c +++ b/rect.c @@ -16,7 +16,12 @@ along with this program. If not, see . */ -/* this file is included into flam3.c once for each buffer bit-width */ +#include + +#include "private.h" +#include "filters.h" +#include "variations.h" +#include "palettes.h" /* * for batch @@ -181,10 +186,9 @@ static void iter_thread(void *fth) { for (j = 0; j < sub_batch_size; j++) { double p0, p1, p00, p11; double dbl_index0,dbl_frac; - double interpcolor[4]; - int ci, color_index0; + double4 interpcolor; + int color_index0; const double4 p = fthp->iter_storage[j]; - bucket *b; if (fthp->cp.rotate != 0.0) { p00 = p[0] - fthp->cp.rot_center[0]; @@ -199,7 +203,6 @@ static void iter_thread(void *fth) { if (p0 >= ficp->bounds[0] && p1 >= ficp->bounds[1] && p0 <= ficp->bounds[2] && p1 <= ficp->bounds[3]) { double logvis=1.0; - bucket *buckets = (bucket *)(ficp->buckets); /* Skip if invisible */ if (p[3]==0) @@ -207,9 +210,6 @@ static void iter_thread(void *fth) { else logvis = p[3]; - b = buckets + (int)(ficp->ws0 * p0 - ficp->wb0s0) + - ficp->width * (int)(ficp->hs1 * p1 - ficp->hb1s1); - dbl_index0 = p[2] * cmap_size; color_index0 = (int) (dbl_index0); @@ -225,11 +225,8 @@ static void iter_thread(void *fth) { dbl_frac = dbl_index0 - (double)color_index0; } - for (ci=0;ci<4;ci++) { - interpcolor[ci] = ficp->dmap[color_index0].color[ci] * (1.0-dbl_frac) + - ficp->dmap[color_index0+1].color[ci] * dbl_frac; - } - + interpcolor = ficp->dmap[color_index0].color * (1.0-dbl_frac) + + ficp->dmap[color_index0+1].color * dbl_frac; } else { /* Palette mode step */ if (color_index0 < 0) { @@ -238,24 +235,15 @@ static void iter_thread(void *fth) { color_index0 = cmap_size_m1; } - for (ci=0;ci<4;ci++) - interpcolor[ci] = ficp->dmap[color_index0].color[ci]; + interpcolor = ficp->dmap[color_index0].color; } - if (p[3]==1.0) { - bump_no_overflow(b[0][0], interpcolor[0]); - bump_no_overflow(b[0][1], interpcolor[1]); - bump_no_overflow(b[0][2], interpcolor[2]); - bump_no_overflow(b[0][3], interpcolor[3]); - bump_no_overflow(b[0][4], 255.0); - } else { - bump_no_overflow(b[0][0], logvis*interpcolor[0]); - bump_no_overflow(b[0][1], logvis*interpcolor[1]); - bump_no_overflow(b[0][2], logvis*interpcolor[2]); - bump_no_overflow(b[0][3], logvis*interpcolor[3]); - bump_no_overflow(b[0][4], logvis*255.0); + if (p[3]!=1.0) { + interpcolor *= logvis; } + ficp->buckets[(int)(ficp->ws0 * p0 - ficp->wb0s0) + ficp->width * (int)(ficp->hs1 * p1 - ficp->hb1s1)] += interpcolor; + } } @@ -266,14 +254,11 @@ static void iter_thread(void *fth) { pthread_exit((void *)0); } -static int render_rectangle(flam3_frame *spec, void *out, +int render_rectangle(flam3_frame *spec, void *out, int field, int nchan, int transp, stat_struct *stats) { long nbuckets; int i, j, k, batch_num, temporal_sample_num; double nsamples, batch_size; - bucket *buckets; - abucket *accumulate; - double4 *points; double *filter, *temporal_filter, *temporal_deltas, *batch_filter; double ppux=0, ppuy=0; int image_width, image_height; /* size of the image to produce */ @@ -304,9 +289,6 @@ static int render_rectangle(flam3_frame *spec, void *out, char *ai; int cmap_size; - char *last_block; - size_t memory_rqd; - /* Per-render progress timers */ time_t progress_timer=0; time_t progress_timer_history[64]; @@ -397,19 +379,16 @@ static int render_rectangle(flam3_frame *spec, void *out, fic.width = oversample * image_width + 2 * gutter_width; nbuckets = (long)fic.width * (long)fic.height; - memory_rqd = (sizeof(bucket) * nbuckets + sizeof(abucket) * nbuckets + - 4 * sizeof(double) * (size_t)(spec->sub_batch_size) * spec->nthreads); - last_block = (char *) malloc(memory_rqd); - if (NULL == last_block) { - fprintf(stderr, "render_rectangle: cannot malloc %g bytes.\n", (double)memory_rqd); - fprintf(stderr, "render_rectangle: w=%d h=%d nb=%ld.\n", fic.width, fic.height, nbuckets); - return(1); - } - /* Just free buckets at the end */ - buckets = (bucket *) last_block; - accumulate = (abucket *) (last_block + sizeof(bucket) * nbuckets); - points = (double4 *) (last_block + (sizeof(bucket) + sizeof(abucket)) * nbuckets); + double4 * const buckets = malloc (nbuckets * sizeof (*buckets)); + assert (buckets != NULL); + double4 * const accumulate = malloc (nbuckets * sizeof (*accumulate)); + assert (accumulate != NULL); + double4 ** const iter_storage = malloc (spec->nthreads * sizeof (*iter_storage)); + assert (iter_storage != NULL); + for (size_t i = 0; i < spec->nthreads; i++) { + iter_storage[i] = malloc (spec->sub_batch_size * sizeof (*iter_storage[i])); + } if (verbose) { fprintf(stderr, "chaos: "); @@ -417,7 +396,7 @@ static int render_rectangle(flam3_frame *spec, void *out, } background[0] = background[1] = background[2] = 0.0; - memset((char *) accumulate, 0, sizeof(abucket) * nbuckets); + memset(accumulate, 0, sizeof(*accumulate) * nbuckets); /* Batch loop - outermost */ @@ -425,7 +404,7 @@ static int render_rectangle(flam3_frame *spec, void *out, double sample_density=0.0; double k1, area, k2; - memset((char *) buckets, 0, sizeof(bucket) * nbuckets); + memset(buckets, 0, sizeof(*buckets) * nbuckets); /* Temporal sample loop */ for (temporal_sample_num = 0; temporal_sample_num < ntemporal_samples; temporal_sample_num++) { @@ -550,7 +529,7 @@ static int render_rectangle(flam3_frame *spec, void *out, fth[thi].timer_initialize = 0; } - fth[thi].iter_storage = &(points[thi*spec->sub_batch_size]); + fth[thi].iter_storage = iter_storage[thi]; fth[thi].fic = &fic; flam3_copy(&(fth[thi].cp),&cp); @@ -610,28 +589,14 @@ static int render_rectangle(flam3_frame *spec, void *out, for (j = 0; j < fic.height; j++) { for (i = 0; i < fic.width; i++) { + const double4 c = buckets[i + j * fic.width]; - abucket *a = accumulate + i + j * fic.width; - bucket *b = buckets + i + j * fic.width; - double c[4], ls; - - c[0] = (double) b[0][0]; - c[1] = (double) b[0][1]; - c[2] = (double) b[0][2]; - c[3] = (double) b[0][3]; if (0.0 == c[3]) continue; - ls = (k1 * log(1.0 + c[3] * k2))/c[3]; - c[0] *= ls; - c[1] *= ls; - c[2] *= ls; - c[3] *= ls; + const double ls = (k1 * log(1.0 + c[3] * k2))/c[3]; - abump_no_overflow(a[0][0], c[0]); - abump_no_overflow(a[0][1], c[1]); - abump_no_overflow(a[0][2], c[2]); - abump_no_overflow(a[0][3], c[3]); + accumulate[i + j * fic.width] += c * ls; } } @@ -646,7 +611,7 @@ static int render_rectangle(flam3_frame *spec, void *out, /* filter the accumulation buffer down into the image */ if (1) { int x, y; - double t[4],newrgb[3]; + double4 t,newrgb; double g = 1.0 / (gamma / vib_gam_n); double tmp,a; double alpha,ls; @@ -666,26 +631,22 @@ static int render_rectangle(flam3_frame *spec, void *out, for (j = 0; j < fic.height; j++) { for (i = 0; i < fic.width; i++) { - - abucket *ac = accumulate + i + j*fic.width; + double4 ac = accumulate[i + j*fic.width]; - if (ac[0][3]<=0) { + if (ac[3]<=0) { alpha = 0.0; ls = 0.0; } else { - tmp=ac[0][3]/PREFILTER_WHITE; + tmp=ac[3]/PREFILTER_WHITE; alpha = flam3_calc_alpha(tmp,g,linrange); ls = vibrancy * 256.0 * alpha / tmp; if (alpha<0.0) alpha = 0.0; if (alpha>1.0) alpha = 1.0; } - t[0] = (double)ac[0][0]; - t[1] = (double)ac[0][1]; - t[2] = (double)ac[0][2]; - t[3] = (double)ac[0][3]; + t = ac; - flam3_calc_newrgb(t, ls, highpow, newrgb); + newrgb = flam3_calc_newrgb(t, ls, highpow); for (rgbi=0;rgbi<3;rgbi++) { a = newrgb[rgbi]; @@ -704,10 +665,12 @@ static int render_rectangle(flam3_frame *spec, void *out, if (a<0) a = 0; /* Replace values in accumulation buffer with these new ones */ - ac[0][rgbi] = a; + ac[rgbi] = a; } - ac[0][3] = alpha; + ac[3] = alpha; + + accumulate[i + j*fic.width] = ac; } } @@ -726,13 +689,13 @@ static int render_rectangle(flam3_frame *spec, void *out, for (ii = 0; ii < filter_width; ii++) { for (jj = 0; jj < filter_width; jj++) { double k = filter[ii + jj * filter_width]; - abucket *ac = accumulate + x+ii + (y+jj)*fic.width; + double4 ac = accumulate[x+ii + (y+jj)*fic.width]; - t[0] += k * ac[0][0]; - t[1] += k * ac[0][1]; - t[2] += k * ac[0][2]; - t[3] += k * ac[0][3]; + t[0] += k * ac[0]; + t[1] += k * ac[1]; + t[2] += k * ac[2]; + t[3] += k * ac[3]; } @@ -757,7 +720,7 @@ static int render_rectangle(flam3_frame *spec, void *out, if (alpha>1.0) alpha = 1.0; } - flam3_calc_newrgb(t, ls, highpow, newrgb); + newrgb = flam3_calc_newrgb(t, ls, highpow); for (rgbi=0;rgbi<3;rgbi++) { a = newrgb[rgbi]; diff --git a/wscript b/wscript index 61459f7..8eb7748 100644 --- a/wscript +++ b/wscript @@ -19,7 +19,7 @@ def configure(conf): conf.write_config_header ('config.h') def build(bld): - bld.stlib (features='c cstlib', source='flam3.c filters.c parser.c variations.c interpolation.c palettes.c png.c random.c docstring.c', target='libflam3', use='xml2 png pthread', includes='.') + bld.stlib (features='c cstlib', source='flam3.c filters.c parser.c variations.c interpolation.c palettes.c png.c random.c docstring.c rect.c', target='libflam3', use='xml2 png pthread', includes='.') bld.program (features='c cprogram', source='flam3-render.c', target='flam3-render', use='libflam3 xml2 png amdlibm pthread', includes='.') bld.program (features='c cprogram', source='flam3-genome.c', target='flam3-genome', use='libflam3 xml2 png amdlibm pthread', includes='.') bld.program (features='c cprogram', source='flam3-animate.c', target='flam3-animate', use='libflam3 xml2 png amdlibm pthread', includes='.') -- cgit v1.2.3