12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457 |
- // SPDX-License-Identifier: Apache-2.0
- // ----------------------------------------------------------------------------
- // Copyright 2011-2024 Arm Limited
- //
- // Licensed under the Apache License, Version 2.0 (the "License"); you may not
- // use this file except in compliance with the License. You may obtain a copy
- // of the License at:
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- // License for the specific language governing permissions and limitations
- // under the License.
- // ----------------------------------------------------------------------------
- #if !defined(ASTCENC_DECOMPRESS_ONLY)
- /**
- * @brief Functions to compress a symbolic block.
- */
- #include "astcenc_internal.h"
- #include "astcenc_diagnostic_trace.h"
- #include <cassert>
- /**
- * @brief Merge two planes of endpoints into a single vector.
- *
- * @param ep_plane1 The endpoints for plane 1.
- * @param ep_plane2 The endpoints for plane 2.
- * @param component_plane2 The color component for plane 2.
- * @param[out] result The merged output.
- */
- static void merge_endpoints(
- const endpoints& ep_plane1,
- const endpoints& ep_plane2,
- unsigned int component_plane2,
- endpoints& result
- ) {
- unsigned int partition_count = ep_plane1.partition_count;
- assert(partition_count == 1);
- vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
- result.partition_count = partition_count;
- result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
- result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
- }
- /**
- * @brief Attempt to improve weights given a chosen configuration.
- *
- * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
- * partition and per plane) and attempt to improve image quality by moving each weight up by one or
- * down by one quantization step.
- *
- * This is a specialized function which only supports operating on undecimated weight grids,
- * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
- * is needed less often.
- *
- * @param decode_mode The decode mode (LDR, HDR).
- * @param bsd The block size information.
- * @param blk The image block color data to compress.
- * @param[out] scb The symbolic compressed block output.
- */
- static bool realign_weights_undecimated(
- astcenc_profile decode_mode,
- const block_size_descriptor& bsd,
- const image_block& blk,
- symbolic_compressed_block& scb
- ) {
- // Get the partition descriptor
- unsigned int partition_count = scb.partition_count;
- const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
- // Get the quantization table
- const block_mode& bm = bsd.get_block_mode(scb.block_mode);
- unsigned int weight_quant_level = bm.quant_mode;
- const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
- unsigned int max_plane = bm.is_dual_plane;
- int plane2_component = scb.plane2_component;
- vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
- // Decode the color endpoints
- bool rgb_hdr;
- bool alpha_hdr;
- vint4 endpnt0[BLOCK_MAX_PARTITIONS];
- vint4 endpnt1[BLOCK_MAX_PARTITIONS];
- vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
- vfloat4 offset[BLOCK_MAX_PARTITIONS];
- promise(partition_count > 0);
- for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
- {
- unpack_color_endpoints(decode_mode,
- scb.color_formats[pa_idx],
- scb.color_values[pa_idx],
- rgb_hdr, alpha_hdr,
- endpnt0[pa_idx],
- endpnt1[pa_idx]);
- }
- uint8_t* dec_weights_uquant = scb.weights;
- bool adjustments = false;
- // For each plane and partition ...
- for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
- {
- for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
- {
- // Compute the endpoint delta for all components in current plane
- vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
- epd = select(epd, vint4::zero(), plane_mask);
- endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
- offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
- }
- // For each weight compute previous, current, and next errors
- promise(bsd.texel_count > 0);
- for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
- {
- int uqw = dec_weights_uquant[texel];
- uint32_t prev_and_next = qat.prev_next_values[uqw];
- int uqw_down = prev_and_next & 0xFF;
- int uqw_up = (prev_and_next >> 8) & 0xFF;
- // Interpolate the colors to create the diffs
- float weight_base = static_cast<float>(uqw);
- float weight_down = static_cast<float>(uqw_down - uqw);
- float weight_up = static_cast<float>(uqw_up - uqw);
- unsigned int partition = pi.partition_of_texel[texel];
- vfloat4 color_offset = offset[partition];
- vfloat4 color_base = endpnt0f[partition];
- vfloat4 color = color_base + color_offset * weight_base;
- vfloat4 orig_color = blk.texel(texel);
- vfloat4 error_weight = blk.channel_weight;
- vfloat4 color_diff = color - orig_color;
- vfloat4 color_diff_down = color_diff + color_offset * weight_down;
- vfloat4 color_diff_up = color_diff + color_offset * weight_up;
- float error_base = dot_s(color_diff * color_diff, error_weight);
- float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
- float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
- // Check if the prev or next error is better, and if so use it
- if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
- {
- dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
- adjustments = true;
- }
- else if ((error_down < error_base) && (uqw > 0))
- {
- dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
- adjustments = true;
- }
- }
- // Prepare iteration for plane 2
- dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
- plane_mask = ~plane_mask;
- }
- return adjustments;
- }
- /**
- * @brief Attempt to improve weights given a chosen configuration.
- *
- * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
- * partition and per plane) and attempt to improve image quality by moving each weight up by one or
- * down by one quantization step.
- *
- * @param decode_mode The decode mode (LDR, HDR).
- * @param bsd The block size information.
- * @param blk The image block color data to compress.
- * @param[out] scb The symbolic compressed block output.
- */
- static bool realign_weights_decimated(
- astcenc_profile decode_mode,
- const block_size_descriptor& bsd,
- const image_block& blk,
- symbolic_compressed_block& scb
- ) {
- // Get the partition descriptor
- unsigned int partition_count = scb.partition_count;
- const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
- // Get the quantization table
- const block_mode& bm = bsd.get_block_mode(scb.block_mode);
- unsigned int weight_quant_level = bm.quant_mode;
- const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
- // Get the decimation table
- const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
- unsigned int weight_count = di.weight_count;
- assert(weight_count != bsd.texel_count);
- unsigned int max_plane = bm.is_dual_plane;
- int plane2_component = scb.plane2_component;
- vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
- // Decode the color endpoints
- bool rgb_hdr;
- bool alpha_hdr;
- vint4 endpnt0[BLOCK_MAX_PARTITIONS];
- vint4 endpnt1[BLOCK_MAX_PARTITIONS];
- vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
- vfloat4 offset[BLOCK_MAX_PARTITIONS];
- promise(partition_count > 0);
- promise(weight_count > 0);
- for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
- {
- unpack_color_endpoints(decode_mode,
- scb.color_formats[pa_idx],
- scb.color_values[pa_idx],
- rgb_hdr, alpha_hdr,
- endpnt0[pa_idx],
- endpnt1[pa_idx]);
- }
- uint8_t* dec_weights_uquant = scb.weights;
- bool adjustments = false;
- // For each plane and partition ...
- for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
- {
- for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
- {
- // Compute the endpoint delta for all components in current plane
- vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
- epd = select(epd, vint4::zero(), plane_mask);
- endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
- offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
- }
- // Create an unquantized weight grid for this decimation level
- ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
- for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
- {
- vint unquant_value(dec_weights_uquant + we_idx);
- vfloat unquant_valuef = int_to_float(unquant_value);
- storea(unquant_valuef, uq_weightsf + we_idx);
- }
- // For each weight compute previous, current, and next errors
- for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
- {
- int uqw = dec_weights_uquant[we_idx];
- uint32_t prev_and_next = qat.prev_next_values[uqw];
- float uqw_base = uq_weightsf[we_idx];
- float uqw_down = static_cast<float>(prev_and_next & 0xFF);
- float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
- float uqw_diff_down = uqw_down - uqw_base;
- float uqw_diff_up = uqw_up - uqw_base;
- vfloat4 error_basev = vfloat4::zero();
- vfloat4 error_downv = vfloat4::zero();
- vfloat4 error_upv = vfloat4::zero();
- // Interpolate the colors to create the diffs
- unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
- promise(texels_to_evaluate > 0);
- for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
- {
- unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
- float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
- float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
- + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
- + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
- + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
- // Ideally this is integer rounded, but IQ gain it isn't worth the overhead
- // float weight = astc::flt_rd(weight_base + 0.5f);
- // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
- // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
- float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
- float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
- unsigned int partition = pi.partition_of_texel[texel];
- vfloat4 color_offset = offset[partition];
- vfloat4 color_base = endpnt0f[partition];
- vfloat4 color = color_base + color_offset * weight_base;
- vfloat4 orig_color = blk.texel(texel);
- vfloat4 color_diff = color - orig_color;
- vfloat4 color_down_diff = color_diff + color_offset * weight_down;
- vfloat4 color_up_diff = color_diff + color_offset * weight_up;
- error_basev += color_diff * color_diff;
- error_downv += color_down_diff * color_down_diff;
- error_upv += color_up_diff * color_up_diff;
- }
- vfloat4 error_weight = blk.channel_weight;
- float error_base = hadd_s(error_basev * error_weight);
- float error_down = hadd_s(error_downv * error_weight);
- float error_up = hadd_s(error_upv * error_weight);
- // Check if the prev or next error is better, and if so use it
- if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
- {
- uq_weightsf[we_idx] = uqw_up;
- dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
- adjustments = true;
- }
- else if ((error_down < error_base) && (uqw > 0))
- {
- uq_weightsf[we_idx] = uqw_down;
- dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
- adjustments = true;
- }
- }
- // Prepare iteration for plane 2
- dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
- plane_mask = ~plane_mask;
- }
- return adjustments;
- }
- /**
- * @brief Compress a block using a chosen partitioning and 1 plane of weights.
- *
- * The function works in three stages:
- *
- * - Ideal endpoints and weights are computed for the block, then ideal weights are derived for
- *   each referenced decimation mode.
- * - For each candidate block mode the weights are quantized and a weight quantization error and a
- *   remaining bit count are recorded.
- * - @c compute_ideal_endpoint_formats() picks up to @c config.tune_candidate_limit candidates.
- *   Each is packed, measured, and refined for up to @c config.tune_refinement_limit iterations
- *   using the weight realignment functions.
- *
- * @c scb is only overwritten by a candidate whose error is lower than the lowest error seen so
- * far, starting from the value of @c scb.errorval on entry.
- *
- * @param config The compressor configuration.
- * @param bsd The block size information.
- * @param blk The image block color data to compress.
- * @param only_always True if we only use "always" percentile block modes.
- * @param tune_errorval_threshold The error value threshold; once a stored candidate is below
- * this value all remaining candidates are skipped.
- * @param partition_count The partition count.
- * @param partition_index The partition index if @c partition_count is 2-4.
- * @param[in,out] scb The symbolic compressed block; @c errorval is read on entry and
- * the block is replaced when a better candidate is found.
- * @param[out] tmpbuf The working buffers; this function uses the @c ei1,
- * @c dec_weights_ideal, @c dec_weights_uquant, @c weight_low_value1,
- * @c weight_high_value1, @c qwt_bitcounts, and @c qwt_errors members.
- * @param quant_limit The upper limit on the weight quantization level; the effective
- * limit is the lower of this value and @c QUANT_32.
- *
- * @return The lowest error value measured for any candidate tried in this call, whether or not it
- * was stored in @c scb, or @c ERROR_CALC_DEFAULT if no candidate was measured.
- */
- static float compress_symbolic_block_for_partition_1plane(
- const astcenc_config& config,
- const block_size_descriptor& bsd,
- const image_block& blk,
- bool only_always,
- float tune_errorval_threshold,
- unsigned int partition_count,
- unsigned int partition_index,
- symbolic_compressed_block& scb,
- compression_working_buffers& tmpbuf,
- int quant_limit
- ) {
- promise(partition_count > 0);
- promise(config.tune_candidate_limit > 0);
- promise(config.tune_refinement_limit > 0);
- int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); // Never exceeds QUANT_32
- auto compute_difference = &compute_symbolic_block_difference_1plane; // General error function
- if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
- {
- compute_difference = &compute_symbolic_block_difference_1plane_1partition; // Variant used for 1 partition without the RGBM flag
- }
- const auto& pi = bsd.get_partition_info(partition_count, partition_index);
- // Compute ideal weights and endpoint colors, with no quantization or decimation
- endpoints_and_weights& ei = tmpbuf.ei1;
- compute_ideal_colors_and_weights_1plane(blk, pi, ei);
- // Compute ideal weights and endpoint colors for every decimation
- float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
- uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
- // For each decimation mode, compute an ideal set of weights with no quantization
- unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
- : bsd.decimation_mode_count_selected;
- promise(max_decimation_modes > 0);
- for (unsigned int i = 0; i < max_decimation_modes; i++)
- {
- const auto& dm = bsd.get_decimation_mode(i);
- if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
- {
- continue;
- }
- const auto& di = bsd.get_decimation_info(i);
- compute_ideal_weights_for_decimation(
- ei,
- di,
- dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); // Each decimation mode owns a BLOCK_MAX_WEIGHTS slice
- }
- // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
- // weight pair, compute the smallest weight that will result in a color value greater than 1
- vfloat4 min_ep(10.0f);
- for (unsigned int i = 0; i < partition_count; i++)
- {
- vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
- vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep); // Only values above 0.5 and below the running minimum are used
- min_ep = select(min_ep, ep, use_ep);
- }
- float min_wt_cutoff = hmin_s(min_ep);
- // For each mode, use the angular method to compute a shift
- compute_angular_endpoints_1plane(
- only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
- float* weight_low_value = tmpbuf.weight_low_value1;
- float* weight_high_value = tmpbuf.weight_high_value1;
- int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
- float* qwt_errors = tmpbuf.qwt_errors;
- // For each mode (which specifies a decimation and a quantization):
- // * Compute number of bits needed for the quantized weights
- // * Generate an optimized set of quantized weights
- // * Compute quantization errors for the mode
- static const int8_t free_bits_for_partition_count[4] { // Indexed by (partition_count - 1)
- 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
- };
- unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
- : bsd.block_mode_count_1plane_selected;
- promise(max_block_modes > 0);
- for (unsigned int i = 0; i < max_block_modes; i++)
- {
- const block_mode& bm = bsd.block_modes[i];
- if (bm.quant_mode > max_weight_quant)
- {
- qwt_errors[i] = 1e38f; // Over the quantization limit; presumably 1e38f makes the mode lose every comparison
- continue;
- }
- assert(!bm.is_dual_plane);
- int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
- if (bitcount <= 0)
- {
- qwt_errors[i] = 1e38f; // No bits remain once the weights are stored
- continue;
- }
- if (weight_high_value[i] > 1.02f * min_wt_cutoff) // High value is more than 2% above the cutoff
- {
- weight_high_value[i] = 1.0f;
- }
- int decimation_mode = bm.decimation_mode;
- const auto& di = bsd.get_decimation_info(decimation_mode);
- qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
- ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
- // Generate the optimized set of weights for the weight mode
- compute_quantized_weights_for_decimation(
- di,
- weight_low_value[i], weight_high_value[i],
- dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
- dec_weights_uquantf,
- dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
- bm.get_weight_quant_mode());
- // Compute weight quantization errors for the block mode
- qwt_errors[i] = compute_error_of_weight_set_1plane(
- ei,
- di,
- dec_weights_uquantf);
- }
- // Decide the optimal combination of color endpoint encodings and weight encodings
- uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
- int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
- quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
- quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
- unsigned int candidate_count = compute_ideal_endpoint_formats(
- pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
- config.tune_candidate_limit, 0, max_block_modes,
- partition_format_specifiers, block_mode_index,
- color_quant_level, color_quant_level_mod, tmpbuf);
- // Iterate over the N believed-to-be-best modes to find out which one is actually best
- float best_errorval_in_mode = ERROR_CALC_DEFAULT; // Lowest error measured in this call; this is the return value
- float best_errorval_in_scb = scb.errorval; // Lowest error stored in scb so far, seeded from the caller's block
- for (unsigned int i = 0; i < candidate_count; i++)
- {
- TRACE_NODE(node0, "candidate");
- const int bm_packed_index = block_mode_index[i];
- assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
- const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
- int decimation_mode = qw_bm.decimation_mode;
- const auto& di = bsd.get_decimation_info(decimation_mode);
- promise(di.weight_count > 0);
- trace_add_data("weight_x", di.weight_x);
- trace_add_data("weight_y", di.weight_y);
- trace_add_data("weight_z", di.weight_z);
- trace_add_data("weight_quant", qw_bm.quant_mode);
- // Recompute the ideal color endpoints before storing them
- vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
- vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
- symbolic_compressed_block workscb;
- endpoints workep = ei.ep;
- uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
- for (unsigned int j = 0; j < di.weight_count; j++)
- {
- workscb.weights[j] = u8_weight_src[j];
- }
- for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
- {
- recompute_ideal_colors_1plane(
- blk, pi, di, workscb.weights,
- workep, rgbs_colors, rgbo_colors);
- // Quantize the chosen color, tracking if worth trying the mod value
- bool all_same = color_quant_level[i] != color_quant_level_mod[i]; // Starts false when the mod level equals the normal level
- for (unsigned int j = 0; j < partition_count; j++)
- {
- workscb.color_formats[j] = pack_color_endpoints(
- workep.endpt0[j],
- workep.endpt1[j],
- rgbs_colors[j],
- rgbo_colors[j],
- partition_format_specifiers[i][j],
- workscb.color_values[j],
- color_quant_level[i]);
- all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
- }
- // If all the color endpoint modes are the same, we get a few more bits to store colors;
- // let's see if we can take advantage of this: requantize all the colors and see if the
- // endpoint modes remain the same.
- workscb.color_formats_matched = 0;
- if (partition_count >= 2 && all_same)
- {
- uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
- uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
- bool all_same_mod = true;
- for (unsigned int j = 0; j < partition_count; j++)
- {
- color_formats_mod[j] = pack_color_endpoints(
- workep.endpt0[j],
- workep.endpt1[j],
- rgbs_colors[j],
- rgbo_colors[j],
- partition_format_specifiers[i][j],
- colorvals[j],
- color_quant_level_mod[i]);
- // Early out as soon as it's no longer possible to use mod
- if (color_formats_mod[j] != color_formats_mod[0])
- {
- all_same_mod = false;
- break;
- }
- }
- if (all_same_mod)
- {
- workscb.color_formats_matched = 1;
- for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++) // Copies every slot, including those at or beyond partition_count
- {
- for (unsigned int k = 0; k < 8; k++)
- {
- workscb.color_values[j][k] = colorvals[j][k];
- }
- workscb.color_formats[j] = color_formats_mod[j];
- }
- }
- }
- // Store header fields
- workscb.partition_count = static_cast<uint8_t>(partition_count);
- workscb.partition_index = static_cast<uint16_t>(partition_index);
- workscb.plane2_component = -1;
- workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
- workscb.block_mode = qw_bm.mode_index;
- workscb.block_type = SYM_BTYPE_NONCONST;
- // Pre-realign test (first refinement iteration only)
- if (l == 0)
- {
- float errorval = compute_difference(config, bsd, workscb, blk);
- if (errorval == -ERROR_CALC_DEFAULT) // NOTE(review): negated sentinel appears to flag an invalid block - confirm against compute_difference
- {
- errorval = -errorval;
- workscb.block_type = SYM_BTYPE_ERROR;
- }
- trace_add_data("error_prerealign", errorval);
- best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
- // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
- // iteration can help more so we give it an extra 8% leeway. Use this knowledge to
- // drive a heuristic to skip blocks that are unlikely to catch up with the best
- // block we have already.
- unsigned int iters_remaining = config.tune_refinement_limit - l;
- float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
- if (errorval > (threshold * best_errorval_in_scb))
- {
- break;
- }
- if (errorval < best_errorval_in_scb)
- {
- best_errorval_in_scb = errorval;
- workscb.errorval = errorval;
- scb = workscb;
- if (errorval < tune_errorval_threshold)
- {
- // Skip remaining candidates - this is "good enough"; setting i to the loop limit ends the outer candidate loop after the break
- i = candidate_count;
- break;
- }
- }
- }
- bool adjustments;
- if (di.weight_count != bsd.texel_count) // Differing counts select the decimated realign function
- {
- adjustments = realign_weights_decimated(
- config.profile, bsd, blk, workscb);
- }
- else
- {
- adjustments = realign_weights_undecimated(
- config.profile, bsd, blk, workscb);
- }
- // Post-realign test
- float errorval = compute_difference(config, bsd, workscb, blk);
- if (errorval == -ERROR_CALC_DEFAULT) // Same sentinel handling as the pre-realign test
- {
- errorval = -errorval;
- workscb.block_type = SYM_BTYPE_ERROR;
- }
- trace_add_data("error_postrealign", errorval);
- best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
- // Average refinement improvement is 3.5% per iteration, so skip blocks that are
- // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
- // give benefit of the doubt ...
- unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
- float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
- if (errorval > (threshold * best_errorval_in_scb))
- {
- break;
- }
- if (errorval < best_errorval_in_scb)
- {
- best_errorval_in_scb = errorval;
- workscb.errorval = errorval;
- scb = workscb;
- if (errorval < tune_errorval_threshold)
- {
- // Skip remaining candidates - this is "good enough"; setting i to the loop limit ends the outer candidate loop after the break
- i = candidate_count;
- break;
- }
- }
- if (!adjustments) // Realignment changed no weight; stop refining this candidate
- {
- break;
- }
- }
- }
- return best_errorval_in_mode;
- }
- /**
- * @brief Compress a block using a chosen partitioning and 2 planes of weights.
- *
- * @param config The compressor configuration.
- * @param bsd The block size information.
- * @param blk The image block color data to compress.
- * @param tune_errorval_threshold The error value threshold.
- * @param plane2_component The component index for the second plane of weights.
- * @param[out] scb The symbolic compressed block output.
- * @param[out] tmpbuf The quantized weights for plane 1.
- */
- static float compress_symbolic_block_for_partition_2planes(
- const astcenc_config& config,
- const block_size_descriptor& bsd,
- const image_block& blk,
- float tune_errorval_threshold,
- unsigned int plane2_component,
- symbolic_compressed_block& scb,
- compression_working_buffers& tmpbuf,
- int quant_limit
- ) {
- promise(config.tune_candidate_limit > 0);
- promise(config.tune_refinement_limit > 0);
- promise(bsd.decimation_mode_count_selected > 0);
- int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
- // Compute ideal weights and endpoint colors, with no quantization or decimation
- endpoints_and_weights& ei1 = tmpbuf.ei1;
- endpoints_and_weights& ei2 = tmpbuf.ei2;
- compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
- // Compute ideal weights and endpoint colors for every decimation
- float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
- uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
- // For each decimation mode, compute an ideal set of weights with no quantization
- for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
- {
- const auto& dm = bsd.get_decimation_mode(i);
- if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
- {
- continue;
- }
- const auto& di = bsd.get_decimation_info(i);
- compute_ideal_weights_for_decimation(
- ei1,
- di,
- dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
- compute_ideal_weights_for_decimation(
- ei2,
- di,
- dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
- }
- // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
- // weight pair, compute the smallest weight that will result in a color value greater than 1
- vfloat4 min_ep1(10.0f);
- vfloat4 min_ep2(10.0f);
- vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
- vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
- min_ep1 = select(min_ep1, ep1, use_ep1);
- vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
- vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
- min_ep2 = select(min_ep2, ep2, use_ep2);
- vfloat4 err_max(ERROR_CALC_DEFAULT);
- vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
- // Set the plane2 component to max error in ep1
- min_ep1 = select(min_ep1, err_max, err_mask);
- float min_wt_cutoff1 = hmin_s(min_ep1);
- // Set the minwt2 to the plane2 component min in ep2
- float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
- compute_angular_endpoints_2planes(
- bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
- // For each mode (which specifies a decimation and a quantization):
- // * Compute number of bits needed for the quantized weights
- // * Generate an optimized set of quantized weights
- // * Compute quantization errors for the mode
- float* weight_low_value1 = tmpbuf.weight_low_value1;
- float* weight_high_value1 = tmpbuf.weight_high_value1;
- float* weight_low_value2 = tmpbuf.weight_low_value2;
- float* weight_high_value2 = tmpbuf.weight_high_value2;
- int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
- float* qwt_errors = tmpbuf.qwt_errors;
- unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
- unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
- for (unsigned int i = start_2plane; i < end_2plane; i++)
- {
- const block_mode& bm = bsd.block_modes[i];
- assert(bm.is_dual_plane);
- if (bm.quant_mode > max_weight_quant)
- {
- qwt_errors[i] = 1e38f;
- continue;
- }
- qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
- if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
- {
- weight_high_value1[i] = 1.0f;
- }
- if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
- {
- weight_high_value2[i] = 1.0f;
- }
- unsigned int decimation_mode = bm.decimation_mode;
- const auto& di = bsd.get_decimation_info(decimation_mode);
- ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
- // Generate the optimized set of weights for the mode
- compute_quantized_weights_for_decimation(
- di,
- weight_low_value1[i],
- weight_high_value1[i],
- dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
- dec_weights_uquantf,
- dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
- bm.get_weight_quant_mode());
- compute_quantized_weights_for_decimation(
- di,
- weight_low_value2[i],
- weight_high_value2[i],
- dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
- dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
- dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
- bm.get_weight_quant_mode());
- // Compute weight quantization errors for the block mode
- qwt_errors[i] = compute_error_of_weight_set_2planes(
- ei1,
- ei2,
- di,
- dec_weights_uquantf,
- dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
- }
- // Decide the optimal combination of color endpoint encodings and weight encodings
- uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
- int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
- quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
- quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
- endpoints epm;
- merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
- const auto& pi = bsd.get_partition_info(1, 0);
- unsigned int candidate_count = compute_ideal_endpoint_formats(
- pi, blk, epm, qwt_bitcounts, qwt_errors,
- config.tune_candidate_limit,
- bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
- partition_format_specifiers, block_mode_index,
- color_quant_level, color_quant_level_mod, tmpbuf);
- // Iterate over the N believed-to-be-best modes to find out which one is actually best
- float best_errorval_in_mode = ERROR_CALC_DEFAULT;
- float best_errorval_in_scb = scb.errorval;
- for (unsigned int i = 0; i < candidate_count; i++)
- {
- TRACE_NODE(node0, "candidate");
- const int bm_packed_index = block_mode_index[i];
- assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
- bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
- const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
- int decimation_mode = qw_bm.decimation_mode;
- const auto& di = bsd.get_decimation_info(decimation_mode);
- promise(di.weight_count > 0);
- trace_add_data("weight_x", di.weight_x);
- trace_add_data("weight_y", di.weight_y);
- trace_add_data("weight_z", di.weight_z);
- trace_add_data("weight_quant", qw_bm.quant_mode);
- vfloat4 rgbs_color;
- vfloat4 rgbo_color;
- symbolic_compressed_block workscb;
- endpoints workep = epm;
- uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
- uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
- for (int j = 0; j < di.weight_count; j++)
- {
- workscb.weights[j] = u8_weight1_src[j];
- workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
- }
- for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
- {
- recompute_ideal_colors_2planes(
- blk, bsd, di,
- workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
- workep, rgbs_color, rgbo_color, plane2_component);
- // Quantize the chosen color
- workscb.color_formats[0] = pack_color_endpoints(
- workep.endpt0[0],
- workep.endpt1[0],
- rgbs_color, rgbo_color,
- partition_format_specifiers[i][0],
- workscb.color_values[0],
- color_quant_level[i]);
- // Store header fields
- workscb.partition_count = 1;
- workscb.partition_index = 0;
- workscb.quant_mode = color_quant_level[i];
- workscb.color_formats_matched = 0;
- workscb.block_mode = qw_bm.mode_index;
- workscb.plane2_component = static_cast<int8_t>(plane2_component);
- workscb.block_type = SYM_BTYPE_NONCONST;
- // Pre-realign test
- if (l == 0)
- {
- float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
- if (errorval == -ERROR_CALC_DEFAULT)
- {
- errorval = -errorval;
- workscb.block_type = SYM_BTYPE_ERROR;
- }
- trace_add_data("error_prerealign", errorval);
- best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
- // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
- // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
- // drive a heuristic to skip blocks that are unlikely to catch up with the best
- // block we have already.
- unsigned int iters_remaining = config.tune_refinement_limit - l;
- float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
- if (errorval > (threshold * best_errorval_in_scb))
- {
- break;
- }
- if (errorval < best_errorval_in_scb)
- {
- best_errorval_in_scb = errorval;
- workscb.errorval = errorval;
- scb = workscb;
- if (errorval < tune_errorval_threshold)
- {
- // Skip remaining candidates - this is "good enough"
- i = candidate_count;
- break;
- }
- }
- }
- // Perform a final pass over the weights to try to improve them.
- bool adjustments;
- if (di.weight_count != bsd.texel_count)
- {
- adjustments = realign_weights_decimated(
- config.profile, bsd, blk, workscb);
- }
- else
- {
- adjustments = realign_weights_undecimated(
- config.profile, bsd, blk, workscb);
- }
- // Post-realign test
- float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
- if (errorval == -ERROR_CALC_DEFAULT)
- {
- errorval = -errorval;
- workscb.block_type = SYM_BTYPE_ERROR;
- }
- trace_add_data("error_postrealign", errorval);
- best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
- // Average refinement improvement is 3.5% per iteration, so skip blocks that are
- // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
- // give benefit of the doubt ...
- unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
- float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
- if (errorval > (threshold * best_errorval_in_scb))
- {
- break;
- }
- if (errorval < best_errorval_in_scb)
- {
- best_errorval_in_scb = errorval;
- workscb.errorval = errorval;
- scb = workscb;
- if (errorval < tune_errorval_threshold)
- {
- // Skip remaining candidates - this is "good enough"
- i = candidate_count;
- break;
- }
- }
- if (!adjustments)
- {
- break;
- }
- }
- }
- return best_errorval_in_mode;
- }
- /**
- * @brief Determine the lowest cross-channel correlation factor.
- *
- * @param texels_per_block The number of texels in a block.
- * @param blk The image block color data to compress.
- *
- * @return Return the lowest correlation factor.
- */
- static float prepare_block_statistics(
- int texels_per_block,
- const image_block& blk
- ) {
- // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
- // of the matrix. The matrix is symmetric, so this is all we need for this use case.
- float rs = 0.0f;
- float gs = 0.0f;
- float bs = 0.0f;
- float as = 0.0f;
- float rr_var = 0.0f;
- float gg_var = 0.0f;
- float bb_var = 0.0f;
- float aa_var = 0.0f;
- float rg_cov = 0.0f;
- float rb_cov = 0.0f;
- float ra_cov = 0.0f;
- float gb_cov = 0.0f;
- float ga_cov = 0.0f;
- float ba_cov = 0.0f;
- float weight_sum = 0.0f;
- promise(texels_per_block > 0);
- for (int i = 0; i < texels_per_block; i++)
- {
- float weight = hadd_s(blk.channel_weight) / 4.0f;
- assert(weight >= 0.0f);
- weight_sum += weight;
- float r = blk.data_r[i];
- float g = blk.data_g[i];
- float b = blk.data_b[i];
- float a = blk.data_a[i];
- float rw = r * weight;
- rs += rw;
- rr_var += r * rw;
- rg_cov += g * rw;
- rb_cov += b * rw;
- ra_cov += a * rw;
- float gw = g * weight;
- gs += gw;
- gg_var += g * gw;
- gb_cov += b * gw;
- ga_cov += a * gw;
- float bw = b * weight;
- bs += bw;
- bb_var += b * bw;
- ba_cov += a * bw;
- float aw = a * weight;
- as += aw;
- aa_var += a * aw;
- }
- float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
- rr_var -= rs * (rs * rpt);
- rg_cov -= gs * (rs * rpt);
- rb_cov -= bs * (rs * rpt);
- ra_cov -= as * (rs * rpt);
- gg_var -= gs * (gs * rpt);
- gb_cov -= bs * (gs * rpt);
- ga_cov -= as * (gs * rpt);
- bb_var -= bs * (bs * rpt);
- ba_cov -= as * (bs * rpt);
- aa_var -= as * (as * rpt);
- // These will give a NaN if a channel is constant - these are fixed up in the next step
- rg_cov *= astc::rsqrt(rr_var * gg_var);
- rb_cov *= astc::rsqrt(rr_var * bb_var);
- ra_cov *= astc::rsqrt(rr_var * aa_var);
- gb_cov *= astc::rsqrt(gg_var * bb_var);
- ga_cov *= astc::rsqrt(gg_var * aa_var);
- ba_cov *= astc::rsqrt(bb_var * aa_var);
- if (astc::isnan(rg_cov)) rg_cov = 1.0f;
- if (astc::isnan(rb_cov)) rb_cov = 1.0f;
- if (astc::isnan(ra_cov)) ra_cov = 1.0f;
- if (astc::isnan(gb_cov)) gb_cov = 1.0f;
- if (astc::isnan(ga_cov)) ga_cov = 1.0f;
- if (astc::isnan(ba_cov)) ba_cov = 1.0f;
- float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
- lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
- lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
- lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
- lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
- // Diagnostic trace points
- trace_add_data("min_r", blk.data_min.lane<0>());
- trace_add_data("max_r", blk.data_max.lane<0>());
- trace_add_data("min_g", blk.data_min.lane<1>());
- trace_add_data("max_g", blk.data_max.lane<1>());
- trace_add_data("min_b", blk.data_min.lane<2>());
- trace_add_data("max_b", blk.data_max.lane<2>());
- trace_add_data("min_a", blk.data_min.lane<3>());
- trace_add_data("max_a", blk.data_max.lane<3>());
- trace_add_data("cov_rg", fabsf(rg_cov));
- trace_add_data("cov_rb", fabsf(rb_cov));
- trace_add_data("cov_ra", fabsf(ra_cov));
- trace_add_data("cov_gb", fabsf(gb_cov));
- trace_add_data("cov_ga", fabsf(ga_cov));
- trace_add_data("cov_ba", fabsf(ba_cov));
- return lowest_correlation;
- }
- /* See header for documentation. */
- void compress_block(
- const astcenc_contexti& ctx,
- const image_block& blk,
- uint8_t pcb[16],
- compression_working_buffers& tmpbuf)
- {
- astcenc_profile decode_mode = ctx.config.profile;
- symbolic_compressed_block scb;
- const block_size_descriptor& bsd = *ctx.bsd;
- float lowest_correl;
- TRACE_NODE(node0, "block");
- trace_add_data("pos_x", blk.xpos);
- trace_add_data("pos_y", blk.ypos);
- trace_add_data("pos_z", blk.zpos);
- // Set stricter block targets for luminance data as we have more bits to play with
- bool block_is_l = blk.is_luminance();
- float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
- // Set slightly stricter block targets for lumalpha data as we have more bits to play with
- bool block_is_la = blk.is_luminancealpha();
- float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
- bool block_skip_two_plane = false;
- int max_partitions = ctx.config.tune_partition_count_limit;
- unsigned int requested_partition_indices[3] {
- ctx.config.tune_2partition_index_limit,
- ctx.config.tune_3partition_index_limit,
- ctx.config.tune_4partition_index_limit
- };
- unsigned int requested_partition_trials[3] {
- ctx.config.tune_2partitioning_candidate_limit,
- ctx.config.tune_3partitioning_candidate_limit,
- ctx.config.tune_4partitioning_candidate_limit
- };
- #if defined(ASTCENC_DIAGNOSTICS)
- // Do this early in diagnostic builds so we can dump uniform metrics
- // for every block. Do it later in release builds to avoid redundant work!
- float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
- float error_threshold = ctx.config.tune_db_limit
- * error_weight_sum
- * block_is_l_scale
- * block_is_la_scale;
- lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
- trace_add_data("lowest_correl", lowest_correl);
- trace_add_data("tune_error_threshold", error_threshold);
- #endif
- // Detected a constant-color block
- if (all(blk.data_min == blk.data_max))
- {
- TRACE_NODE(node1, "pass");
- trace_add_data("partition_count", 0);
- trace_add_data("plane_count", 1);
- scb.partition_count = 0;
- // Encode as FP16 if using HDR
- if ((decode_mode == ASTCENC_PRF_HDR) ||
- (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
- {
- scb.block_type = SYM_BTYPE_CONST_F16;
- vint4 color_f16 = float_to_float16(blk.origin_texel);
- store(color_f16, scb.constant_color);
- }
- // Encode as UNORM16 if NOT using HDR
- else
- {
- scb.block_type = SYM_BTYPE_CONST_U16;
- vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
- vint4 color_u16 = float_to_int_rtn(color_f32);
- store(color_u16, scb.constant_color);
- }
- trace_add_data("exit", "quality hit");
- symbolic_to_physical(bsd, scb, pcb);
- return;
- }
- #if !defined(ASTCENC_DIAGNOSTICS)
- float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
- float error_threshold = ctx.config.tune_db_limit
- * error_weight_sum
- * block_is_l_scale
- * block_is_la_scale;
- #endif
- // Set SCB and mode errors to a very high error value
- scb.errorval = ERROR_CALC_DEFAULT;
- scb.block_type = SYM_BTYPE_ERROR;
- float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
- ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
- };
- float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
- 0.0f,
- ctx.config.tune_2partition_early_out_limit_factor,
- ctx.config.tune_3partition_early_out_limit_factor,
- 0.0f
- };
- // Trial using 1 plane of weights and 1 partition.
- // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
- // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
- // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
- // compression and slightly reduces image quality.
- float errorval_mult[2] {
- 1.0f / ctx.config.tune_mse_overshoot,
- 1.0f
- };
- static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
- // Only enable MODE0 fast path if enabled
- // Never enable for 3D blocks as no "always" block modes are available
- int start_trial = 1;
- if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
- {
- start_trial = 0;
- }
- int quant_limit = QUANT_32;
- for (int i = start_trial; i < 2; i++)
- {
- TRACE_NODE(node1, "pass");
- trace_add_data("partition_count", 1);
- trace_add_data("plane_count", 1);
- trace_add_data("search_mode", i);
- float errorval = compress_symbolic_block_for_partition_1plane(
- ctx.config, bsd, blk, i == 0,
- error_threshold * errorval_mult[i] * errorval_overshoot,
- 1, 0, scb, tmpbuf, QUANT_32);
- // Record the quant level so we can use the filter later searches
- const auto& bm = bsd.get_block_mode(scb.block_mode);
- quant_limit = bm.get_weight_quant_mode();
- best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
- if (errorval < (error_threshold * errorval_mult[i]))
- {
- trace_add_data("exit", "quality hit");
- goto END_OF_TESTS;
- }
- }
- #if !defined(ASTCENC_DIAGNOSTICS)
- lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
- #endif
- block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
- // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
- // alpha is the most likely to be non-correlated if it is present in the data.
- for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
- {
- TRACE_NODE(node1, "pass");
- trace_add_data("partition_count", 1);
- trace_add_data("plane_count", 2);
- trace_add_data("plane_component", i);
- if (block_skip_two_plane)
- {
- trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
- continue;
- }
- if (blk.grayscale && i != 3)
- {
- trace_add_data("skip", "grayscale block");
- continue;
- }
- if (blk.is_constant_channel(i))
- {
- trace_add_data("skip", "constant component");
- continue;
- }
- float errorval = compress_symbolic_block_for_partition_2planes(
- ctx.config, bsd, blk, error_threshold * errorval_overshoot,
- i, scb, tmpbuf, quant_limit);
- // If attempting two planes is much worse than the best one plane result
- // then further two plane searches are unlikely to help so move on ...
- if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
- {
- break;
- }
- if (errorval < error_threshold)
- {
- trace_add_data("exit", "quality hit");
- goto END_OF_TESTS;
- }
- }
- // Find best blocks for 2, 3 and 4 partitions
- for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
- {
- unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
- unsigned int requested_indices = requested_partition_indices[partition_count - 2];
- unsigned int requested_trials = requested_partition_trials[partition_count - 2];
- requested_trials = astc::min(requested_trials, requested_indices);
- unsigned int actual_trials = find_best_partition_candidates(
- bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
- float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
- for (unsigned int i = 0; i < actual_trials; i++)
- {
- TRACE_NODE(node1, "pass");
- trace_add_data("partition_count", partition_count);
- trace_add_data("partition_index", partition_indices[i]);
- trace_add_data("plane_count", 1);
- trace_add_data("search_mode", i);
- float errorval = compress_symbolic_block_for_partition_1plane(
- ctx.config, bsd, blk, false,
- error_threshold * errorval_overshoot,
- partition_count, partition_indices[i],
- scb, tmpbuf, quant_limit);
- best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
- // If using N partitions doesn't improve much over using N-1 partitions then skip trying
- // N+1. Error can dramatically improve if the data is correlated or non-correlated and
- // aligns with a partitioning that suits that encoding, so for this inner loop check add
- // a large error scale because the "other" trial could be a lot better.
- float best_error = best_errorvals_for_pcount[partition_count - 1];
- float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
- if (best_error > (best_error_in_prev * best_error_scale))
- {
- trace_add_data("skip", "tune_partition_early_out_limit_factor");
- goto END_OF_TESTS;
- }
- if (errorval < error_threshold)
- {
- trace_add_data("exit", "quality hit");
- goto END_OF_TESTS;
- }
- }
- // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
- float best_error = best_errorvals_for_pcount[partition_count - 1];
- float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
- if (best_error > (best_error_in_prev * best_error_scale))
- {
- trace_add_data("skip", "tune_partition_early_out_limit_factor");
- goto END_OF_TESTS;
- }
- }
- trace_add_data("exit", "quality not hit");
- END_OF_TESTS:
- // If we still have an error block then convert to something we can encode
- // TODO: Do something more sensible here, such as average color block
- if (scb.block_type == SYM_BTYPE_ERROR)
- {
- #if defined(ASTCENC_DIAGNOSTICS)
- static bool printed_once = false;
- if (!printed_once)
- {
- printed_once = true;
- printf("WARN: At least one block failed to find a valid encoding.\n"
- " Try increasing compression quality settings.\n\n");
- }
- #endif
- scb.block_type = SYM_BTYPE_CONST_U16;
- vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
- vint4 color_u16 = float_to_int_rtn(color_f32);
- store(color_u16, scb.constant_color);
- }
- // Compress to a physical block
- symbolic_to_physical(bsd, scb, pcb);
- }
- #endif
|