| 1 | /* |
|---|
| 2 | * Copyright (C) 2007 Michael Lewis |
|---|
| 3 | * Author: Mike Lewis <mikelikespie@gmail.com> |
|---|
| 4 | * |
|---|
| 5 | * This work is provided AS IS, and has no warranty. |
|---|
| 6 | * The author is NOT responsible for anything that happens |
|---|
| 7 | * due to use of this code, either using it or running it on |
|---|
| 8 | * your system |
|---|
| 9 | */ |
|---|
#ifdef __SSE2__
#include <xmmintrin.h>
#include <emmintrin.h>
#endif

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "types.h"
#include "helpers.h"
#include "block.h"
|---|
| 22 | |
|---|
| 23 | |
|---|
| 24 | void cycle_block( llife_block block, |
|---|
| 25 | llife_block tl, llife_block t, llife_block tr, |
|---|
| 26 | llife_block l, llife_block r, |
|---|
| 27 | llife_block bl, llife_block b, llife_block br ) { |
|---|
| 28 | int i,j; |
|---|
| 29 | |
|---|
| 30 | //const __m128i onemask = _mm_set1_epi8( 0x01 ); |
|---|
| 31 | |
|---|
| 32 | const __m128i populate = _mm_set1_epi8( 3 ); |
|---|
| 33 | __m128i top_left, top, top_right; |
|---|
| 34 | __m128i left, self, right; |
|---|
| 35 | __m128i bot_left, bot, bot_right; |
|---|
| 36 | __m128i itemp; |
|---|
| 37 | |
|---|
| 38 | top_left = top = top_right = self = _mm_setzero_si128(); |
|---|
| 39 | |
|---|
| 40 | if( t ) { |
|---|
| 41 | self = _mm_load_si128( (*t->current) + 127 ); |
|---|
| 42 | left = shift_pack_right( self ); |
|---|
| 43 | right = shift_pack_left( self ); |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | if( tl ) { |
|---|
| 47 | itemp = _mm_load_si128( (*tl->current) + 127 ); |
|---|
| 48 | left = _mm_or_si128( shift_pack_left127(itemp ), left ); |
|---|
| 49 | } |
|---|
| 50 | |
|---|
| 51 | if( tr ) { |
|---|
| 52 | itemp = _mm_load_si128( (*tr->current) + 127 ); |
|---|
| 53 | right = _mm_or_si128( shift_pack_right127( itemp ), right ); |
|---|
| 54 | } |
|---|
| 55 | |
|---|
| 56 | |
|---|
| 57 | |
|---|
| 58 | bot = _mm_load_si128( (*block->current) + 0 ); |
|---|
| 59 | bot_left = shift_pack_right( bot ); |
|---|
| 60 | bot_right = shift_pack_left( bot ); |
|---|
| 61 | |
|---|
| 62 | |
|---|
| 63 | if( l ) { |
|---|
| 64 | itemp = _mm_load_si128( (*l->current) + 0 ); |
|---|
| 65 | bot_left = _mm_or_si128( shift_pack_left127( itemp ), bot_left ); |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | if( r ) { |
|---|
| 69 | itemp = _mm_load_si128( (*r->current) + 0 ); |
|---|
| 70 | bot_right = _mm_or_si128( shift_pack_right127( itemp ), bot_right ); |
|---|
| 71 | } |
|---|
| 72 | |
|---|
| 73 | for( i = 0; i < 128; i++ ) { |
|---|
| 74 | |
|---|
| 75 | top_left = left; top = self; top_right = right; |
|---|
| 76 | left = bot_left; self = bot; right = bot_right; |
|---|
| 77 | |
|---|
| 78 | if( i < 127 ) { |
|---|
| 79 | bot = _mm_load_si128( (*block->current) + i + 1 ); |
|---|
| 80 | bot_left = shift_pack_right( bot ); |
|---|
| 81 | bot_right = shift_pack_left( bot ); |
|---|
| 82 | if( l ) { |
|---|
| 83 | itemp = _mm_load_si128( (*l->current) + i + 1 ); |
|---|
| 84 | bot_left = _mm_or_si128( shift_pack_left127( itemp ), bot_left ); |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | if( r ) { |
|---|
| 88 | itemp = _mm_load_si128( (*r->current) + i + 1 ); |
|---|
| 89 | bot_right = _mm_or_si128( shift_pack_right127( itemp ), bot_right ); |
|---|
| 90 | } |
|---|
| 91 | } else { |
|---|
| 92 | bot = _mm_setzero_si128(); |
|---|
| 93 | if( b ) { |
|---|
| 94 | bot = (*b->current)[0]; |
|---|
| 95 | bot_left = shift_pack_right( bot ); |
|---|
| 96 | bot_right = shift_pack_left( bot ); |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | if( bl ) { |
|---|
| 100 | bot_left = _mm_or_si128( shift_pack_left127( (*bl->current)[0] ), bot_left ); |
|---|
| 101 | } |
|---|
| 102 | |
|---|
| 103 | if( br ) { |
|---|
| 104 | bot_right = _mm_or_si128( shift_pack_right127( (*br->current)[0] ), bot_right ); |
|---|
| 105 | } |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | __m128i temp = _mm_setzero_si128(); |
|---|
| 109 | for( j = 0; j < 8; j++ ) { |
|---|
| 110 | __m128i _top_left = unpack(top_left, j); |
|---|
| 111 | __m128i _top = unpack( top, j ); |
|---|
| 112 | __m128i sum1 = _mm_add_epi8( _top_left, _top ); |
|---|
| 113 | |
|---|
| 114 | __m128i _top_right = unpack( top_right, j ); |
|---|
| 115 | __m128i _right = unpack ( right, j ); |
|---|
| 116 | __m128i sum2 = _mm_add_epi8( _top_right, _right ); |
|---|
| 117 | sum1 = _mm_add_epi8( sum1, sum2 ); |
|---|
| 118 | |
|---|
| 119 | __m128i _bot = unpack( bot, j ); |
|---|
| 120 | __m128i _bot_right = unpack( bot_right, j ); |
|---|
| 121 | __m128i sum4 = _mm_add_epi8( _bot, _bot_right ); |
|---|
| 122 | |
|---|
| 123 | __m128i _bot_left = unpack( bot_left, j ); |
|---|
| 124 | __m128i _left = unpack( left, j ); |
|---|
| 125 | __m128i sum3 = _mm_add_epi8( _left, _bot_left ); |
|---|
| 126 | |
|---|
| 127 | sum3 = _mm_add_epi8( sum3, sum4 ); |
|---|
| 128 | |
|---|
| 129 | |
|---|
| 130 | sum3 = _mm_add_epi8( sum1, sum3 ); |
|---|
| 131 | |
|---|
| 132 | sum1 = _mm_cmpeq_epi8( sum3, populate ); |
|---|
| 133 | |
|---|
| 134 | __m128i _self = unpack( self, j ); |
|---|
| 135 | sum3 = _mm_add_epi8( _self, sum3 ); |
|---|
| 136 | |
|---|
| 137 | sum3 = _mm_cmpeq_epi8( sum3, populate ); |
|---|
| 138 | |
|---|
| 139 | sum3 = _mm_or_si128( sum1, sum3 ); |
|---|
| 140 | |
|---|
| 141 | |
|---|
| 142 | temp = _mm_or_si128( temp, pack( sum3, j ) ); |
|---|
| 143 | //temp = _mm_or_si128( pack( unpack( bot_right, j ), j ), temp ); |
|---|
| 144 | } |
|---|
| 145 | _mm_store_si128( (*block->working) + i, temp ); |
|---|
| 146 | } |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | void worldCycle( llife_world world ) { |
|---|
| 150 | int i, j; |
|---|
| 151 | //#pragma omp parallel private(i) |
|---|
| 152 | //#pragma omp for schedule(static) |
|---|
| 153 | for( i = 0; i < world->width; i++ ) { |
|---|
| 154 | for( j = 0; j < world->height; j++ ) { |
|---|
| 155 | llife_block center = get_life_block( world, j, i); |
|---|
| 156 | if( center ) { |
|---|
| 157 | llife_block tl, t, tr, l, r, bl, b, br; |
|---|
| 158 | |
|---|
| 159 | tl = i > 0 && j > 0 ? get_life_block( world, j-1, i-1 ) : NULL; |
|---|
| 160 | tr = i < world->width-1 && j > 0 ? get_life_block( world, j-1, i+1 ) : NULL; |
|---|
| 161 | t = j > 0 ? get_life_block( world, j-1, i ) : NULL; |
|---|
| 162 | |
|---|
| 163 | l = i > 0 ? get_life_block( world, j, i-1 ) : NULL; |
|---|
| 164 | r = i < world->width-1 ? get_life_block( world, j, i+1 ) : NULL; |
|---|
| 165 | |
|---|
| 166 | bl = i > 0 && j < world->height-1 ? get_life_block( world, j+1, i-1 ) : NULL; |
|---|
| 167 | br = i < world->width-1 && j < world->height-1 ? get_life_block( world, j+1, i+1 ) : NULL; |
|---|
| 168 | b = j < world->height-1 ? get_life_block( world, j+1, i ) : NULL; |
|---|
| 169 | |
|---|
| 170 | |
|---|
| 171 | cycle_block( center, tl, t, tr, l, r, bl, b, br ); |
|---|
| 172 | } |
|---|
| 173 | } |
|---|
| 174 | } |
|---|
| 175 | |
|---|
| 176 | for( i = 0; i < world->width * world->height; i++ ) { |
|---|
| 177 | |
|---|
| 178 | llife_block center = world->blocks[i]; |
|---|
| 179 | data_block * mtemp = center->current; |
|---|
| 180 | center->current = center->working; |
|---|
| 181 | center->working = mtemp; |
|---|
| 182 | } |
|---|
| 183 | } |
|---|
| 184 | |
|---|
| 185 | //returns NULL if fail; |
|---|
| 186 | life_block *allocate_block() { |
|---|
| 187 | //We're going to do two allocs so our data is alligned |
|---|
| 188 | llife_block lb = calloc( 1, sizeof( life_block ) ); |
|---|
| 189 | if( !lb ) { |
|---|
| 190 | return 0; |
|---|
| 191 | } |
|---|
| 192 | //Allocate both of the buffers in 1 chunk. It will make life easier |
|---|
| 193 | |
|---|
| 194 | lb->allocated = (data_block*)calloc( 1, sizeof( data_block ) * 2 + 0x7F );// calloc( 2, sizeof( __m128i ) * 128 ); |
|---|
| 195 | if( !lb->allocated ) { |
|---|
| 196 | free( lb ); |
|---|
| 197 | return 0; |
|---|
| 198 | } |
|---|
| 199 | lb->current = (data_block*)((((unsigned long)lb->allocated) + 0x7F) & ~0x7F); |
|---|
| 200 | lb->working = &(lb->current)[1]; |
|---|
| 201 | return lb; |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | void free_block( life_block *block ) { |
|---|
| 205 | if( !block ) { |
|---|
| 206 | return; |
|---|
| 207 | } |
|---|
| 208 | free( block->allocated ); |
|---|
| 209 | free( block ); |
|---|
| 210 | } |
|---|
| 211 | |
|---|
| 212 | llife_world allocate_world( int height, int width ) { |
|---|
| 213 | |
|---|
| 214 | llife_world world = malloc( sizeof( life_world ) ); |
|---|
| 215 | world->blocks = calloc( width * height, sizeof( llife_block ) ); |
|---|
| 216 | |
|---|
| 217 | int i; |
|---|
| 218 | for( i = 0; i < width * height; i++ ) { |
|---|
| 219 | llife_block blk = allocate_block(); |
|---|
| 220 | if( !blk ) { |
|---|
| 221 | free_world( world ); |
|---|
| 222 | return 0; |
|---|
| 223 | } |
|---|
| 224 | world->blocks[i] = blk; |
|---|
| 225 | } |
|---|
| 226 | |
|---|
| 227 | world->width = width; |
|---|
| 228 | world->height = height; |
|---|
| 229 | world->origin_x = world->origin_y = 0; |
|---|
| 230 | return world; |
|---|
| 231 | } |
|---|
| 232 | void free_world( llife_world world ) { |
|---|
| 233 | |
|---|
| 234 | if( !world ) { |
|---|
| 235 | return; |
|---|
| 236 | } |
|---|
| 237 | int i; |
|---|
| 238 | for( i = 0; i < world->height * world->width; i++ ) { |
|---|
| 239 | free_block( world->blocks[i] ); |
|---|
| 240 | } |
|---|
| 241 | free( world->blocks ); |
|---|
| 242 | } |
|---|
| 243 | |
|---|
| 244 | void set_bit_in_world( llife_world world, int row, int column ) { |
|---|
| 245 | int block_column = column / 128; |
|---|
| 246 | int block_row = row / BLOCK_HEIGHT; |
|---|
| 247 | |
|---|
| 248 | int inner_column = column % 128; |
|---|
| 249 | int inner_row = row % BLOCK_HEIGHT; |
|---|
| 250 | |
|---|
| 251 | llife_block blk = get_life_block( world, block_row, block_column ); |
|---|
| 252 | |
|---|
| 253 | |
|---|
| 254 | set_bit_in_block( blk, inner_row, inner_column ); |
|---|
| 255 | } |
|---|
| 256 | // XX d3 d2 a3 a2 a1 b3 b2 b1 c3 c2 c1 |
|---|
| 257 | // a3 a2 a1 b3 b2 b1 c3 c2 d1 d3 d2 d1 |
|---|
| 258 | // b3 b2 b1 c3 c2 c1 d3 d2 d1 a2 a1 XX |
|---|
| 259 | |
|---|
| 260 | |
|---|