From 4d2d896e28446928d820bf1353abcf40e3f66ed8 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 1 Mar 2015 13:02:50 +0100 Subject: Switch to OpenMP Replaces quality (target density) parameter with time limit. In preparation for rendering resumption. --- flam3.c | 32 +--- flam3.h | 6 +- main.c | 59 +++---- palettes.c | 12 +- private.h | 2 - rect.c | 545 +++++++++++++++++++++++-------------------------------------- rect.h | 14 +- vector.h | 2 + wscript | 5 +- 9 files changed, 256 insertions(+), 421 deletions(-) diff --git a/flam3.c b/flam3.c index 3bb89cd..873efbd 100644 --- a/flam3.c +++ b/flam3.c @@ -37,8 +37,6 @@ #include #include -#include - char *flam3_version() { return VERSION; } @@ -182,7 +180,7 @@ int flam3_create_chaos_distrib(flam3_genome *cp, int xi, unsigned short *xform_d */ -int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, unsigned short *xform_distrib, randctx *rc) { +int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, const unsigned short *xform_distrib, randctx *rc) { int i; double4 p, q; int consec = 0; @@ -933,7 +931,7 @@ void flam3_copy_xform(flam3_xform *dest, flam3_xform *src) { } /* Copy one control point to another */ -void flam3_copy(flam3_genome *dest, flam3_genome *src) { +void flam3_copy(flam3_genome *dest, const flam3_genome * const src) { int i,ii; int numstd; @@ -1107,32 +1105,6 @@ void clear_cp(flam3_genome *cp, int default_flag) { } - -int flam3_count_nthreads(void) { - int nthreads; - -#ifndef _SC_NPROCESSORS_ONLN - char line[MAXBUF]; - FILE *f = fopen("/proc/cpuinfo", "r"); - if (NULL == f) goto def; - nthreads = 0; - while (fgets(line, MAXBUF, f)) { - if (!strncmp("processor\t:", line, 11)) - nthreads++; - } - fclose(f); - if (nthreads < 1) goto def; - return (nthreads); -def: - fprintf(stderr, "could not read /proc/cpuinfo, using one render thread.\n"); - nthreads = 1; -#else - nthreads = sysconf(_SC_NPROCESSORS_ONLN); - if (nthreads < 1) 
nthreads = 1; -#endif - return (nthreads); -} - flam3_genome *flam3_parse_xml2(char *xmldata, char *xmlfilename, int default_flag, int *ncps, randctx * const rc) { xmlDocPtr doc; /* Parsed XML document tree */ diff --git a/flam3.h b/flam3.h index 3c17ffd..ac8278d 100644 --- a/flam3.h +++ b/flam3.h @@ -508,7 +508,7 @@ void flam3_add_motion_element(flam3_xform *xf); void flam3_add_xforms(flam3_genome *cp, int num_to_add, int interp_padding, int final_flag); void flam3_delete_xform(flam3_genome *thiscp, int idx_to_delete); void flam3_copy_xform(flam3_xform *dest, flam3_xform *src); -void flam3_copy(flam3_genome *dest, flam3_genome *src); +void flam3_copy(flam3_genome *dest, const flam3_genome * const src); void flam3_copyx(flam3_genome *dest, flam3_genome *src, int num_std, int num_final); void flam3_copy_params(flam3_xform *dest, flam3_xform *src, int varn); void flam3_delete_motion_elements(flam3_xform *xf); @@ -523,7 +523,7 @@ void clear_cp(flam3_genome *cp, int def_flag); (samples[2], samples[3]) as starting color coordinate, perform fuse iterations and throw them away, then perform nsamples iterations and save them in the samples array */ -int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, unsigned short *xform_distrib, randctx *rc); +int flam3_iterate(flam3_genome *cp, int n, int fuse, const double4 in, double4 *samples, const unsigned short *xform_distrib, randctx *rc); void apply_motion_parameters(flam3_xform *xf, flam3_xform *addto, double blend); @@ -564,8 +564,6 @@ double flam3_lyapunov(flam3_genome *g, int ntries); void flam3_apply_template(flam3_genome *cp, flam3_genome *templ); -int flam3_count_nthreads(void); - typedef struct { double pixel_aspect_ratio; /* width over height of each pixel */ flam3_genome *genomes; diff --git a/main.c b/main.c index 931fee0..7a699c3 100644 --- a/main.c +++ b/main.c @@ -33,8 +33,8 @@ const char *argp_program_version = "vlam3-pre"; typedef struct { - unsigned int threads, bpc, quality; 
- float scale; + unsigned int bpc; + float scale, time; } render_arguments; static error_t parse_render_opt (int key, char *arg, @@ -51,12 +51,12 @@ static error_t parse_render_opt (int key, char *arg, break; } - case 'q': { - int i = atoi (arg); - if (i < 1) { - argp_error (state, "Quality must be >= 1"); + case 't': { + float i = atof (arg); + if (i <= 0) { + argp_error (state, "Time must be > 0"); } else { - arguments->quality = i; + arguments->time = i; } break; } @@ -68,16 +68,6 @@ static error_t parse_render_opt (int key, char *arg, } break; - case 't': { - int i = atoi (arg); - if (i <= 0) { - argp_error (state, "Threads must be > 0"); - } else { - arguments->threads = i; - } - break; - } - case ARGP_KEY_ARG: if (state->arg_num > 0) { return ARGP_ERR_UNKNOWN; @@ -110,36 +100,27 @@ static void do_render (const render_arguments * const arguments) { flam3_genome * const genome = &cps[0]; - genome->sample_density = arguments->quality; genome->height *= arguments->scale; genome->width *= arguments->scale; genome->pixels_per_unit *= arguments->scale; - flam3_frame f; - f.genomes = genome; - f.ngenomes = 1; - f.time = 0.0; - f.pixel_aspect_ratio = 1.0; - f.progress = 0; - f.nthreads = arguments->threads; - f.earlyclip = 0; - f.sub_batch_size = 10000; - f.bytes_per_channel = arguments->bpc / 8; - + const unsigned int bytes_per_channel = arguments->bpc/8; const unsigned int channels = 4; const size_t this_size = channels * genome->width * genome->height * - f.bytes_per_channel; + bytes_per_channel; void *image = (void *) calloc(this_size, sizeof(char)); - stat_struct stats; - if (render_parallel (&f, image, &stats)) { - fprintf(stderr,"error rendering image: aborting.\n"); - exit(1); - } + bucket bucket; + bucket_init (&bucket, (uint2) { genome->width, genome->height }); + + render_bucket (genome, &bucket, arguments->time); + fprintf (stderr, "%lu samples, %lu bad\n", + bucket.samples, bucket.badvals); + render_image (genome, &bucket, image, bytes_per_channel); 
flam3_img_comments fpc; write_png (stdout, image, genome->width, genome->height, &fpc, - f.bytes_per_channel); + bytes_per_channel); } static void print_genome (flam3_genome * const genome) { @@ -492,10 +473,9 @@ int main (int argc, char **argv) { } else if (streq (command, "render")) { /* render flame to image file */ const struct argp_option options[] = { - {"threads", 't', "num", 0, "Number of threads (auto)" }, {"scale", 's', "factor", 0, "Scale image dimensions by factor (1.0)" }, {"bpc", 'b', "8|16", 0, "Bits per channel of output image (8)" }, - {"quality", 'q', "num", 0, "Average samples per pixel (100)" }, + {"time", 't', "seconds", 0, "Rendering time" }, {"width", 'w', "pixels", 0, "Output image width" }, {"height", 'h', "pixels", 0, "Output image height" }, { 0 }, @@ -507,10 +487,9 @@ int main (int argc, char **argv) { }; render_arguments arguments = { - .threads = flam3_count_nthreads(), .bpc = 8, .scale = 1.0, - .quality = 100, + .time = 1.0, }; argp_parse (&argp, argc, argv, 0, NULL, &arguments); diff --git a/palettes.c b/palettes.c index f73bd24..cf76003 100644 --- a/palettes.c +++ b/palettes.c @@ -16,6 +16,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include + #include "private.h" #include "palettes.h" #include "rect.h" @@ -353,7 +355,6 @@ static double try_colors(flam3_genome *g, int color_resolution) { flam3_genome saved; double scalar; int pixtotal; - stat_struct stats; memset(&saved, 0, sizeof(flam3_genome)); @@ -382,10 +383,11 @@ static double try_colors(flam3_genome *g, int color_resolution) { f.sub_batch_size = 10000; image = (unsigned char *) calloc(g->width * g->height, 3); - if (render_parallel (&f, image, &stats)) { - fprintf(stderr,"Error rendering test image for trycolors. 
Aborting.\n"); - return(-1); - } + + bucket bucket; + bucket_init (&bucket, (uint2) { g->width, g->height }); + render_bucket (g, &bucket, 0.2); + render_image (g, &bucket, image, f.bytes_per_channel); hist = calloc(sizeof(int), res3); p = image; diff --git a/private.h b/private.h index ff0d1c1..3f2b044 100644 --- a/private.h +++ b/private.h @@ -33,8 +33,6 @@ #include -#include - #define EPS (1e-10) #define CMAP_SIZE 256 #define CMAP_SIZE_M1 255 diff --git a/rect.c b/rect.c index 172bd4e..0d9a9db 100644 --- a/rect.c +++ b/rect.c @@ -18,6 +18,7 @@ #include #include +#include #include "private.h" #include "variations.h" @@ -25,35 +26,20 @@ #include "math.h" #include "rect.h" -/* allow this many iterations for settling into attractor */ -#define FUSE_27 15 -#define FUSE_28 100 - -/* Structures for passing parameters to iteration threads */ typedef struct { - unsigned short *xform_distrib; /* Distribution of xforms based on weights */ - flam3_frame *spec; /* Frame contains timing information */ - double bounds[4]; /* Corner coords of viewable area */ - double2 rot[3]; /* Rotation transformation */ - double size[2]; - int width, height; /* buffer width/height */ - double ws0, wb0s0, hs1, hb1s1; /* shortcuts for indexing */ - flam3_palette_entry *dmap; /* palette */ - double color_scalar; /* <1.0 if non-uniform motion blur is set */ - double4 *buckets; /* Points to the first accumulator */ - double badvals; /* accumulates all badvalue resets */ - double batch_size; - int aborted, cmap_size; - /* mutex for bucket accumulator */ - pthread_mutex_t bucket_mutex; -} flam3_iter_constants; + double timelimit; + unsigned int sub_batch_size, fuse; + unsigned short *xform_distrib; -typedef struct { - flam3_genome cp; /* Full copy of genome for use by the thread */ - flam3_iter_constants *fic; /* Constants for render */ - /* thread number */ - size_t i; -} flam3_thread_helper; + flam3_palette dmap; + unsigned int cmap_size; + + /* camera stuff */ + double ws0, wb0s0, hs1, hb1s1; /* 
shortcuts for indexing */ + double bounds[4]; /* Corner coords of viewable area */ + double2 rot[3]; /* Rotation transformation */ + double ppux, ppuy; +} render_constants; /* Lookup color [0,1] */ @@ -90,92 +76,87 @@ static double4 color_palette_lookup (const double color, } } -static void *iter_thread(void *fth) { - double sub_batch; - int j; - flam3_thread_helper *fthp = (flam3_thread_helper *)fth; - flam3_iter_constants *ficp = fthp->fic; - int SBS = ficp->spec->sub_batch_size; - int fuse; - int cmap_size = ficp->cmap_size; - double4 *iter_storage; - randctx rc; - - rand_seed (&rc); - - int ret = posix_memalign ((void **) &iter_storage, sizeof (*iter_storage), - SBS * sizeof (*iter_storage)); - assert (ret == 0); - assert (iter_storage != NULL); - - fuse = (ficp->spec->earlyclip) ? FUSE_28 : FUSE_27; - - for (sub_batch = 0; sub_batch < ficp->batch_size; sub_batch+=SBS) { - int sub_batch_size, badcount; - /* sub_batch is double so this is sketchy */ - sub_batch_size = (sub_batch + SBS > ficp->batch_size) ? 
- (ficp->batch_size - sub_batch) : SBS; - - /* Seed iterations */ - const double4 start = (double4) { - rand_d11(&rc), - rand_d11(&rc), - rand_d01(&rc), - rand_d01(&rc), - }; - - /* Execute iterations */ - badcount = flam3_iterate(&(fthp->cp), sub_batch_size, fuse, start, iter_storage, ficp->xform_distrib, &rc); - - /* Lock mutex for access to accumulator */ - pthread_mutex_lock(&ficp->bucket_mutex); - - /* Add the badcount to the counter */ - ficp->badvals += badcount; - - /* Put them in the bucket accumulator */ - for (j = 0; j < sub_batch_size; j++) { - double4 p = iter_storage[j]; - - if (fthp->cp.rotate != 0.0) { - const double2 p01 = (double2) { p[0], p[1] }; - const double2 rotatedp = apply_affine (p01, ficp->rot); - p[0] = rotatedp[0]; - p[1] = rotatedp[1]; - } - - /* Skip if out of bounding box or invisible */ - if (p[0] >= ficp->bounds[0] && p[1] >= ficp->bounds[1] && - p[0] <= ficp->bounds[2] && p[1] <= ficp->bounds[3] && - p[3] > 0) { - const size_t ix = (int)(ficp->ws0 * p[0] - ficp->wb0s0) + ficp->width * (int)(ficp->hs1 * p[1] - ficp->hb1s1); +static void iter_thread (flam3_genome * const input_genome, + bucket * const bucket, const render_constants * const c, + volatile bool * const stopped) { + randctx rc; + rand_seed (&rc); + + flam3_genome genome; + flam3_copy (&genome, input_genome); + + double4 *iter_storage; + int ret = posix_memalign ((void **) &iter_storage, sizeof (*iter_storage), + c->sub_batch_size * sizeof (*iter_storage)); + assert (ret == 0); + assert (iter_storage != NULL); + + const double starttime = omp_get_wtime (); + + do { + /* Seed iterations */ + const double4 start = (double4) { + rand_d11(&rc), + rand_d11(&rc), + rand_d01(&rc), + rand_d01(&rc), + }; + + /* Execute iterations */ + const unsigned long badcount = flam3_iterate(&genome, + c->sub_batch_size, c->fuse, start, iter_storage, + c->xform_distrib, &rc); + +#pragma omp critical + { + /* Add the badcount to the counter */ + bucket->badvals += badcount; + bucket->samples 
+= c->sub_batch_size; + + /* Put them in the bucket accumulator */ + for (unsigned int j = 0; j < c->sub_batch_size; j++) { + double4 p = iter_storage[j]; + + if (genome.rotate != 0.0) { + const double2 p01 = (double2) { p[0], p[1] }; + const double2 rotatedp = apply_affine (p01, c->rot); + p[0] = rotatedp[0]; + p[1] = rotatedp[1]; + } + + /* Skip if out of bounding box or invisible */ + if (p[0] >= c->bounds[0] && p[1] >= c->bounds[1] && + p[0] <= c->bounds[2] && p[1] <= c->bounds[3] && + p[3] > 0) { + const size_t ix = (int)(c->ws0 * p[0] - c->wb0s0) + bucket->dim[0] * (int)(c->hs1 * p[1] - c->hb1s1); #if HAVE_BUILTIN_PREFETCH - /* prefetch for reading (0) with no locality (0). This (partially) - * hides the load latency for the += operation at the end of this - * block */ - __builtin_prefetch (&ficp->buckets[ix], 0, 0); + /* prefetch for reading (0) with no locality (0). This (partially) + * hides the load latency for the += operation at the end of this + * block */ + __builtin_prefetch (&bucket->data[ix], 0, 0); #endif - double4 interpcolor = color_palette_lookup (p[2], - fthp->cp.palette_mode, ficp->dmap, cmap_size); - - const double logvis = p[3]; - if (logvis != 1.0) { - interpcolor *= logvis; - } + double4 interpcolor = color_palette_lookup (p[2], + genome.palette_mode, c->dmap, c->cmap_size); - ficp->buckets[ix] += interpcolor; + const double logvis = p[3]; + if (logvis != 1.0) { + interpcolor *= logvis; + } - } - } - - /* Release mutex */ - pthread_mutex_unlock(&ficp->bucket_mutex); - - } + bucket->data[ix] += interpcolor; + } + } + } +#pragma omp master + { + if (omp_get_wtime () - starttime > c->timelimit) { + *stopped = true; + } + } + } while (!(*stopped)); - free (iter_storage); - return NULL; + free (iter_storage); } /* Perform clipping @@ -206,252 +187,142 @@ static double4 clip (const double4 in, const double g, const double linrange, return newrgb; } -int render_parallel (flam3_frame *spec, void *out, stat_struct *stats) { - long nbuckets; - 
double ppux=0, ppuy=0; - int image_width, image_height; /* size of the image to produce */ - int out_width; - int bytes_per_channel = spec->bytes_per_channel; - double highpow; - flam3_palette dmap; - double vibrancy = 0.0; - double gamma = 0.0; - int vib_gam_n = 0; - flam3_genome cp; - unsigned short *xform_distrib; - flam3_iter_constants fic; - flam3_thread_helper *fth; - pthread_attr_t pt_attr; - pthread_t *myThreads=NULL; - int thi; - int cmap_size; - - fic.badvals = 0; - fic.aborted = 0; - - stats->num_iters = 0; - - /* correct for apophysis's use of 255 colors in the palette rather than all 256 */ - cmap_size = 256; - - memset(&cp,0, sizeof(flam3_genome)); - - /* interpolate and get a control point */ - flam3_interpolate(spec->genomes, spec->ngenomes, spec->time, 0, &cp); - highpow = cp.highlight_power; - - /* Initialize the thread helper structures */ - fth = (flam3_thread_helper *)calloc(spec->nthreads,sizeof(flam3_thread_helper)); - for (unsigned int i=0;i<spec->nthreads;i++) - fth[i].cp.final_xform_index=-1; - - /* Set up the output image dimensions, adjusted for scanline */ - const unsigned int channels = 4; - image_width = cp.width; - out_width = image_width; - image_height = cp.height; - - /* Allocate the space required to render the image */ - fic.height = image_height; - fic.width = image_width; - - nbuckets = (long)fic.width * (long)fic.height; - - double4 *buckets; - int ret = posix_memalign ((void **) &buckets, sizeof (*buckets), - nbuckets * sizeof (*buckets)); - assert (ret == 0); - assert (buckets != NULL); - memset (buckets, 0, nbuckets * sizeof (*buckets)); - - double sample_density=0.0; - - /* Batch loop - outermost */ - { - - - { - - /* Get the xforms ready to render */ - if (prepare_precalc_flags(&cp)) { - fprintf(stderr,"prepare xform pointers returned error: aborting.\n"); - return(1); - } - xform_distrib = flam3_create_xform_distrib(&cp); - if (xform_distrib==NULL) { - fprintf(stderr,"create xform distrib returned error: aborting.\n"); - 
return(1); - } - - /* compute the colormap entries. */ - /* the input colormap is 256 long with entries from 0 to 1.0 */ - for (unsigned int j = 0; j < CMAP_SIZE; j++) { - dmap[j].index = cp.palette[(j * 256) / CMAP_SIZE].index / 256.0; - for (unsigned int k = 0; k < 4; k++) - dmap[j].color[k] = cp.palette[(j * 256) / CMAP_SIZE].color[k]; - } - - /* compute camera */ - { - double corner0, corner1; - double scale; - - if (cp.sample_density <= 0.0) { - fprintf(stderr, - "sample density (quality) must be greater than zero," - " not %g.\n", cp.sample_density); - return(1); - } - - scale = pow(2.0, cp.zoom); - sample_density = cp.sample_density * scale * scale; - - ppux = cp.pixels_per_unit * scale; - ppuy = ppux; - ppux /= spec->pixel_aspect_ratio; - corner0 = cp.center[0] - image_width / ppux / 2.0; - corner1 = cp.center[1] - image_height / ppuy / 2.0; - fic.bounds[0] = corner0; - fic.bounds[1] = corner1; - fic.bounds[2] = corner0 + image_width / ppux; - fic.bounds[3] = corner1 + image_height / ppuy; - fic.size[0] = 1.0 / (fic.bounds[2] - fic.bounds[0]); - fic.size[1] = 1.0 / (fic.bounds[3] - fic.bounds[1]); - rotate_center ((double2) { cp.rot_center[0], cp.rot_center[1] }, - cp.rotate, fic.rot); - fic.ws0 = fic.width * fic.size[0]; - fic.wb0s0 = fic.ws0 * fic.bounds[0]; - fic.hs1 = fic.height * fic.size[1]; - fic.hb1s1 = fic.hs1 * fic.bounds[1]; - - } - - /* number of samples is based only on the output image size */ - double nsamples = sample_density * image_width * image_height; - - /* how many of these samples are rendered in this loop? 
*/ - double batch_size = nsamples; - - stats->num_iters += batch_size; - - /* Fill in the iter constants */ - fic.xform_distrib = xform_distrib; - fic.spec = spec; - fic.batch_size = batch_size / (double)spec->nthreads; - fic.cmap_size = cmap_size; - - fic.dmap = (flam3_palette_entry *)dmap; - fic.buckets = (void *)buckets; - - /* Initialize the thread helper structures */ - for (thi = 0; thi < spec->nthreads; thi++) { - fth[thi].fic = &fic; - fth[thi].i = thi; - flam3_copy(&(fth[thi].cp),&cp); - } - - /* Let's make some threads */ - myThreads = (pthread_t *)malloc(spec->nthreads * sizeof(pthread_t)); - - pthread_mutex_init(&fic.bucket_mutex, NULL); - - pthread_attr_init(&pt_attr); - pthread_attr_setdetachstate(&pt_attr,PTHREAD_CREATE_JOINABLE); - - for (thi=0; thi <spec->nthreads; thi ++) - pthread_create(&myThreads[thi], &pt_attr, (void *)iter_thread, (void *)(&(fth[thi]))); - - pthread_attr_destroy(&pt_attr); - - /* Wait for them to return */ - for (thi=0; thi < spec->nthreads; thi++) - pthread_join(myThreads[thi], NULL); - - pthread_mutex_destroy(&fic.bucket_mutex); - - free(myThreads); - - /* Free the xform_distrib array */ - free(xform_distrib); - - if (fic.aborted) { - goto done; - } - - vibrancy += cp.vibrancy; - gamma += cp.gamma; - vib_gam_n++; - - } - - -#if 0 - printf("iw=%d,ih=%d,ppux=%f,ppuy=%f\n",image_width,image_height,ppux,ppuy); - printf("contrast=%f, brightness=%f, PREFILTER=%d\n", - cp.contrast, cp.brightness, PREFILTER_WHITE); - printf("area = %f, WHITE_LEVEL=%d, sample_density=%f\n", - area, WHITE_LEVEL, sample_density); - printf("k1=%f,k2=%15.12f\n",k1,k2); -#endif - - } +void bucket_init (bucket * const b, const uint2 dim) { + memset (b, 0, sizeof (*b)); + b->dim = dim; - /* filter the accumulation buffer down into the image */ - if (1) { - const double g = 1.0 / (gamma / vib_gam_n); + size_t size = dim[0] * dim[1] * sizeof (*b->data); + int ret = posix_memalign ((void **) &b->data, sizeof (*b->data), size); + assert (ret == 0); + assert (b->data 
!= NULL); + memset (b->data, 0, size); +} - double linrange = cp.gam_lin_thresh; +static void compute_camera (const flam3_genome * const genome, + const bucket * const bucket, render_constants * const c) { + assert (genome != NULL); + assert (bucket != NULL); + assert (c != NULL); + + double corner0, corner1; + + const double scale = pow(2.0, genome->zoom); + + c->ppux = genome->pixels_per_unit * scale; + c->ppuy = c->ppux; + //ppux /= spec->pixel_aspect_ratio; + corner0 = genome->center[0] - bucket->dim[0] / c->ppux / 2.0; + corner1 = genome->center[1] - bucket->dim[1] / c->ppuy / 2.0; + c->bounds[0] = corner0; + c->bounds[1] = corner1; + c->bounds[2] = corner0 + bucket->dim[0] / c->ppux; + c->bounds[3] = corner1 + bucket->dim[1] / c->ppuy; + const double size[2] = {1.0 / (c->bounds[2] - c->bounds[0]), + 1.0 / (c->bounds[3] - c->bounds[1])}; + rotate_center ((double2) { genome->rot_center[0], genome->rot_center[1] }, + genome->rotate, c->rot); + c->ws0 = bucket->dim[0] * size[0]; + c->wb0s0 = c->ws0 * c->bounds[0]; + c->hs1 = bucket->dim[1] * size[1]; + c->hb1s1 = c->hs1 * c->bounds[1]; +} - vibrancy /= vib_gam_n; - - /* XXX: the original formula has a factor 268/256 in here, not sure why */ - const double k1 = cp.contrast * cp.brightness; - const double area = image_width * image_height / (ppux * ppuy); - const double k2 = 1.0 / (cp.contrast * area * sample_density); +bool render_bucket (flam3_genome * const genome, bucket * const bucket, + const double timelimit) { + assert (bucket != NULL); + assert (genome != NULL); + + int ret = prepare_precalc_flags(genome); + assert (ret == 0); + + render_constants c = { + .fuse = 100, + .sub_batch_size = 10000, + .xform_distrib = flam3_create_xform_distrib(genome), + .timelimit = timelimit, + }; + assert (c.xform_distrib != NULL); + + /* compute the colormap entries. 
*/ + /* the input colormap is 256 long with entries from 0 to 1.0 */ + for (unsigned int j = 0; j < CMAP_SIZE; j++) { + c.dmap[j].index = genome->palette[(j * 256) / CMAP_SIZE].index / 256.0; + for (unsigned int k = 0; k < 4; k++) { + c.dmap[j].color[k] = genome->palette[(j * 256) / CMAP_SIZE].color[k]; + } + } + c.cmap_size = 256; - for (unsigned int y = 0; y < image_height; y++) { - for (unsigned int x = 0; x < image_width; x++) { - double4 t = buckets[x + y * fic.width]; + /* compute camera */ + compute_camera (genome, bucket, &c); - const double ls = (k1 * log(1.0 + t[3] * k2))/t[3]; + bool stopped = false; +#pragma omp parallel shared(stopped) + iter_thread (genome, bucket, &c, &stopped); - t = t * ls; - t = clip (t, g, linrange, highpow, vibrancy); + free (c.xform_distrib); - const double maxval = (1 << (bytes_per_channel*8)) - 1; - t = nearbyint_d4 (t * maxval); + return true; +} - if (bytes_per_channel == 2) { - uint16_t * const p = &((uint16_t *) out)[channels * (x + y * out_width)]; +void render_image (const flam3_genome * const genome, const bucket * const bucket, + void * const out, const unsigned int bytes_per_channel) { + assert (genome != NULL); + assert (bucket != NULL); + assert (bucket->data != NULL); + + const unsigned int pixels = bucket->dim[0] * bucket->dim[1]; + const unsigned int channels = 4; + + /* XXX: copied from above */ + const double scale = pow(2.0, genome->zoom); + const double ppux = genome->pixels_per_unit * scale; + const double ppuy = ppux; + + const double sample_density = (double) bucket->samples / (double) pixels; + const double g = 1.0 / genome->gamma; + const double linrange = genome->gam_lin_thresh; + const double vibrancy = genome->vibrancy; + /* XXX: the original formula has a factor 268/256 in here, not sure why */ + const double k1 = genome->contrast * genome->brightness; + const double area = (double) pixels / (ppux * ppuy); + const double k2 = 1.0 / (genome->contrast * area * sample_density); + const double highpow = 
genome->highlight_power; + +#pragma omp parallel for + for (unsigned int i = 0; i < pixels; i++) { + double4 t = bucket->data[i]; + + const double ls = (k1 * log(1.0 + t[3] * k2))/t[3]; + + t = t * ls; + t = clip (t, g, linrange, highpow, vibrancy); + + const double maxval = (1 << (bytes_per_channel*8)) - 1; + t = nearbyint_d4 (t * maxval); + + switch (bytes_per_channel) { + case 2: { + uint16_t * const p = &((uint16_t *) out)[channels * i]; p[0] = t[0]; p[1] = t[1]; p[2] = t[2]; p[3] = t[3]; - } else if (bytes_per_channel == 1) { - uint8_t * const p = &((uint8_t *) out)[channels * (x + y * out_width)]; + break; + } + + case 1: { + uint8_t * const p = &((uint8_t *) out)[channels * i]; p[0] = t[0]; p[1] = t[1]; p[2] = t[2]; p[3] = t[3]; - } else { - assert (0); + break; } - } - } - } - - done: - - stats->badvals = fic.badvals; - - free(buckets); - /* We have to clear the cps in fth first */ - for (thi = 0; thi < spec->nthreads; thi++) { - clear_cp(&(fth[thi].cp),0); - } - free(fth); - clear_cp(&cp,0); - - return(0); + default: + assert (0); + break; + } + } } diff --git a/rect.h b/rect.h index 3def002..39c76cb 100644 --- a/rect.h +++ b/rect.h @@ -1,6 +1,18 @@ #pragma once #include -int render_parallel (flam3_frame *spec, void *out, stat_struct *stats); +#include "vector.h" +typedef struct { + /* bucket width/height */ + uint2 dim; + double4 *data; + unsigned long int badvals, samples; +} bucket; + +void bucket_init (bucket * const b, const uint2 dim); +bool render_bucket (flam3_genome * const genome, bucket * const bucket, + const double timelimit); +void render_image (const flam3_genome * const genome, const bucket * const b, + void * const out, const unsigned int bytes_per_channel); diff --git a/vector.h b/vector.h index 6e65186..2c4ff52 100644 --- a/vector.h +++ b/vector.h @@ -5,10 +5,12 @@ /* LLVM/clang */ typedef double double2 __attribute__ ((ext_vector_type (2))); typedef double double4 __attribute__ ((ext_vector_type (4))); +typedef unsigned int uint2 
__attribute__ ((ext_vector_type (2))); #endif #else /* GCC */ typedef double double2 __attribute__ ((vector_size (sizeof (double)*2))); typedef double double4 __attribute__ ((vector_size (sizeof (double)*4))); +typedef unsigned int uint2 __attribute__ ((vector_size (sizeof (unsigned int)*2))); #endif diff --git a/wscript b/wscript index 732086e..c6ed83f 100644 --- a/wscript +++ b/wscript @@ -7,10 +7,11 @@ def configure(conf): conf.env.append_unique ('CFLAGS', '-std=gnu99') conf.env.append_unique ('CFLAGS', '-D_GNU_SOURCE') + conf.env.append_unique ('CFLAGS', '-fopenmp') + conf.env.append_unique ('LINKFLAGS', '-fopenmp') conf.check_cfg (path='xml2-config', args='--cflags --libs', package='', uselib_store='xml2') conf.check_cc (lib='xml2', header_name='libxml/parser.h', function_name='xmlParseFile', use='xml2') - conf.check_cc (lib='pthread', uselib_store='pthread') conf.check_cfg (package='libpng', uselib_store='png', args=['--cflags', '--libs'], msg='Checking for library png') conf.check_cc (lib='amdlibm', header_name='amdlibm.h', mandatory=False, define_name='HAVE_AMDLIBM', uselib_store='amdlibm') @@ -19,5 +20,5 @@ def configure(conf): conf.write_config_header ('config.h') def build(bld): - bld.program (features='c cprogram', source='flam3.c parser.c variations.c interpolation.c palettes.c png.c random.c rect.c main.c', target='vlam3', use='xml2 png amdlibm pthread') + bld.program (features='c cprogram', source='flam3.c parser.c variations.c interpolation.c palettes.c png.c random.c rect.c main.c', target='vlam3', use='xml2 png amdlibm') -- cgit v1.2.3