Quelle simple_encode.h

Sprache: C

/*
*  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
*
*  Use of this source code is governed by a BSD-style license
*  that can be found in the LICENSE file in the root of the source
*  tree. An additional intellectual property rights grant can be found
*  in the file PATENTS.  All contributing project authors may
*  be found in the AUTHORS file in the root of the source tree.
*/

#ifndef VPX_VP9_SIMPLE_ENCODE_H_
#define VPX_VP9_SIMPLE_ENCODE_H_

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

namespace vp9 {

enum StatusCode {
  StatusOk = 0,
  StatusError,
};

// TODO(angiebird): Add description for each frame type.
enum FrameType {
  kFrameTypeKey = 0,
  kFrameTypeInter = 1,
  kFrameTypeAltRef = 2,
  kFrameTypeOverlay = 3,
  kFrameTypeGolden = 4,
};

// TODO(angiebird): Add description for each reference frame type.
// This enum numbers have to be contiguous and start from zero except
// kNoneRefFrame.
enum RefFrameType {
  kRefFrameTypeLast = 0,
  kRefFrameTypePast = 1,
  kRefFrameTypeFuture = 2,
  kRefFrameTypeMax = 3,
  kRefFrameTypeNone = -1,
};

enum VP9_LEVEL {
  LEVEL_UNKNOWN = 0,
  LEVEL_AUTO = 1,
  LEVEL_1 = 10,
  LEVEL_1_1 = 11,
  LEVEL_2 = 20,
  LEVEL_2_1 = 21,
  LEVEL_3 = 30,
  LEVEL_3_1 = 31,
  LEVEL_4 = 40,
  LEVEL_4_1 = 41,
  LEVEL_5 = 50,
  LEVEL_5_1 = 51,
  LEVEL_5_2 = 52,
  LEVEL_6 = 60,
  LEVEL_6_1 = 61,
  LEVEL_6_2 = 62,
  LEVEL_MAX = 255
};

enum GopMapFlag {
  kGopMapFlagStart =
      1 << 0,  // Indicate this location is the start of a group of pictures.
  kGopMapFlagUseAltRef =
      1 << 1,  // Indicate this group of pictures will use an alt ref. Only set
               // this flag when kGopMapFlagStart is set.
};

// The frame is split to 4x4 blocks.
// This structure contains the information of each 4x4 block.
struct PartitionInfo {
  int row;           // row pixel offset of current 4x4 block
  int column;        // column pixel offset of current 4x4 block
  int row_start;     // row pixel offset of the start of the prediction block
  int column_start;  // column pixel offset of the start of the prediction block
  int width;         // prediction block width
  int height;        // prediction block height
};

constexpr int kMotionVectorSubPixelPrecision = 8;
constexpr int kMotionVectorFullPixelPrecision = 1;

// In the first pass. The frame is split to 16x16 blocks.
// This structure contains the information of each 16x16 block.
// In the second pass. The frame is split to 4x4 blocks.
// This structure contains the information of each 4x4 block.
struct MotionVectorInfo {
  // Number of valid motion vectors, always 0 if this block is in the key frame.
  // For inter frames, it could be 1 or 2.
  int mv_count;
  // The reference frame for motion vectors. If the second motion vector does
  // not exist (mv_count = 1), the reference frame is kNoneRefFrame.
  // Otherwise, the reference frame is either kRefFrameTypeLast, or
  // kRefFrameTypePast, or kRefFrameTypeFuture.
  RefFrameType ref_frame[2];
  // The row offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_row[2];
  // The column offset of motion vectors in the unit of pixel.
  // If the second motion vector does not exist, the value is 0.
  double mv_column[2];
};

// Accumulated tpl stats of all blocks in one frame.
// For each frame, the tpl stats are computed per 32x32 block.
struct TplStatsInfo {
  // Intra complexity: the sum of absolute transform difference (SATD) of
  // intra predicted residuals.
  int64_t intra_cost;
  // Inter complexity: the SATD of inter predicted residuals.
  int64_t inter_cost;
  // Motion compensated information flow. It measures how much information
  // is propagated from the current frame to other frames.
  int64_t mc_flow;
  // Motion compensated dependency cost. It equals to its own intra_cost
  // plus the mc_flow.
  int64_t mc_dep_cost;
  // Motion compensated reference cost.
  int64_t mc_ref_cost;
};

struct RefFrameInfo {
  int coding_indexes[kRefFrameTypeMax];

  // Indicate whether the reference frames are available or not.
  // When the reference frame type is not valid, it means either the to-be-coded
  // frame is a key frame or the reference frame already appears in other
  // reference frame type. vp9 always keeps three types of reference frame
  // available.  However, the duplicated reference frames will not be
  // chosen by the encoder. The priorities of choosing reference frames are
  // kRefFrameTypeLast > kRefFrameTypePast > kRefFrameTypeFuture.
  // For example, if kRefFrameTypeLast and kRefFrameTypePast both point to the
  // same frame, kRefFrameTypePast will be set to invalid.
  // 1: the ref frame type is available 0: the ref frame type is not available
  int valid_list[kRefFrameTypeMax];
};

bool operator==(const RefFrameInfo &a, const RefFrameInfo &b);

struct EncodeFrameInfo {
  int show_idx;

  // Each show or no show frame is assigned with a coding index based on its
  // coding order (starting from zero) in the coding process of the entire
  // video. The coding index for each frame is unique.
  int coding_index;
  RefFrameInfo ref_frame_info;
  FrameType frame_type;
};

// This structure is a copy of vp9 |nmv_component_counts|.
struct NewMotionvectorComponentCounts {
  std::vector<unsigned int> sign;
  std::vector<unsigned int> classes;
  std::vector<unsigned int> class0;
  std::vector<std::vector<unsigned int>> bits;
  std::vector<std::vector<unsigned int>> class0_fp;
  std::vector<unsigned int> fp;
  std::vector<unsigned int> class0_hp;
  std::vector<unsigned int> hp;
};

// This structure is a copy of vp9 |nmv_context_counts|.
struct NewMotionVectorContextCounts {
  std::vector<unsigned int> joints;
  std::vector<NewMotionvectorComponentCounts> comps;
};

using UintArray2D = std::vector<std::vector<unsigned int>>;
using UintArray3D = std::vector<std::vector<std::vector<unsigned int>>>;
using UintArray5D = std::vector<
    std::vector<std::vector<std::vector<std::vector<unsigned int>>>>>;
using UintArray6D = std::vector<std::vector<
    std::vector<std::vector<std::vector<std::vector<unsigned int>>>>>>;

// This structure is a copy of vp9 |tx_counts|.
struct TransformSizeCounts {
  // Transform size found in blocks of partition size 32x32.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (3: 32x32, 16x16, 8x8)
  UintArray2D p32x32;
  // Transform size found in blocks of partition size 16x16.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (2: 16x16, 8x8)
  UintArray2D p16x16;
  // Transform size found in blocks of partition size 8x8.
  // First dimension: transform size contexts (2).
  // Second dimension: transform size type (1: 8x8)
  UintArray2D p8x8;
  // Overall transform size count.
  std::vector<unsigned int> tx_totals;
};

// This structure is a copy of vp9 |FRAME_COUNTS|.
struct FrameCounts {
  // Intra prediction mode for luma plane. First dimension: block size (4).
  // Second dimension: intra prediction mode (10).
  UintArray2D y_mode;
  // Intra prediction mode for chroma plane. First and second dimension:
  // intra prediction mode (10).
  UintArray2D uv_mode;
  // Partition type. First dimension: partition contexts (16).
  // Second dimension: partition type (4).
  UintArray2D partition;
  // Transform coefficient.
  UintArray6D coef;
  // End of block (the position of the last non-zero transform coefficient)
  UintArray5D eob_branch;
  // Interpolation filter type. First dimension: switchable filter contexts (4).
  // Second dimension: filter types (3).
  UintArray2D switchable_interp;
  // Inter prediction mode (the motion vector type).
  // First dimension: inter mode contexts (7).
  // Second dimension: mode type (4).
  UintArray2D inter_mode;
  // Block is intra or inter predicted. First dimension: contexts (4).
  // Second dimension: type (0 for intra, 1 for inter).
  UintArray2D intra_inter;
  // Block is compound predicted (predicted from average of two blocks).
  // First dimension: contexts (5).
  // Second dimension: type (0 for single, 1 for compound prediction).
  UintArray2D comp_inter;
  // Type of the reference frame. Only one reference frame.
  // First dimension: context (5). Second dimension: context (2).
  // Third dimension: count (2).
  UintArray3D single_ref;
  // Type of the two reference frames.
  // First dimension: context (5). Second dimension: count (2).
  UintArray2D comp_ref;
  // Block skips transform and quantization, uses prediction as reconstruction.
  // First dimension: contexts (3). Second dimension: type (0 not skip, 1 skip).
  UintArray2D skip;
  // Transform size.
  TransformSizeCounts tx;
  // New motion vector.
  NewMotionVectorContextCounts mv;
};

struct ImageBuffer {
  // The image data is stored in raster order,
  // i.e. image[plane][r][c] =
  // plane_buffer[plane][r * plane_width[plane] + plane_height[plane]].
  std::unique_ptr<unsigned char[]> plane_buffer[3];
  int plane_width[3];
  int plane_height[3];
};

void output_image_buffer(const ImageBuffer &image_buffer, std::FILE *out_file);

struct EncodeFrameResult {
  int show_idx;
  FrameType frame_type;
  int coding_idx;
  RefFrameInfo ref_frame_info;
  size_t coding_data_bit_size;
  size_t coding_data_byte_size;
  // The EncodeFrame will allocate a buffer, write the coding data into the
  // buffer and give the ownership of the buffer to coding_data.
  std::unique_ptr<unsigned char[]> coding_data;
  size_t max_coding_data_byte_size;
  double psnr;
  uint64_t sse;
  int quantize_index;
  FrameCounts frame_counts;
  int num_rows_4x4;  // number of row units, in size of 4.
  int num_cols_4x4;  // number of column units, in size of 4.
  // A vector of the partition information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains the current pixel position (|row|, |column|),
  // the start pixel position of the partition (|row_start|, |column_start|),
  // and the |width|, |height| of the partition.
  // The current pixel position can be the same as the start pixel position
  // if the 4x4 block is the top-left block in the partition. Otherwise, they
  // are different.
  // Within the same partition, all 4x4 blocks have the same |row_start|,
  // |column_start|, |width| and |height|.
  // For example, if the frame is partitioned to a 32x32 block,
  // starting at (0, 0). Then, there're 64 4x4 blocks within this partition.
  // They all have the same |row_start|, |column_start|, |width|, |height|,
  // which can be used to figure out the start of the current partition and
  // the start of the next partition block.
  // Horizontal next: |column_start| + |width|,
  // Vertical next: |row_start| + |height|.
  std::vector<PartitionInfo> partition_info;
  // A vector of the motion vector information of the frame.
  // The number of elements is |num_rows_4x4| * |num_cols_4x4|.
  // The frame is divided into 4x4 blocks of |num_rows_4x4| rows and
  // |num_cols_4x4| columns.
  // Each 4x4 block contains 0 motion vector if this is an intra predicted
  // frame (for example, the key frame). If the frame is inter predicted,
  // each 4x4 block contains either 1 or 2 motion vectors.
  // Similar to partition info, all 4x4 blocks inside the same partition block
  // share the same motion vector information.
  std::vector<MotionVectorInfo> motion_vector_info;
  // A vector of the tpl stats information.
  // The tpl stats measure the complexity of a frame, as well as the
  // information propagated along the motion trajectory between frames, in
  // the reference frame structure.
  // The tpl stats could be used as a more accurate spatial and temporal
  // complexity measure in addition to the first pass stats.
  // The vector contains tpl stats for all show frames in a GOP.
  // The tpl stats stored in the vector is according to the encoding order.
  // For example, suppose there are N show frames for the current GOP.
  // Then tpl_stats_info[0] stores the information of the first frame to be
  // encoded for this GOP, i.e., the AltRef frame.
  std::vector<TplStatsInfo> tpl_stats_info;
  ImageBuffer coded_frame;

  // recode_count, q_index_history and rate_history are only available when
  // EncodeFrameWithTargetFrameBits() is used.
  int recode_count;
  std::vector<int> q_index_history;
  std::vector<int> rate_history;
};

struct GroupOfPicture {
  // This list will be updated internally in StartEncode() and
  // EncodeFrame()/EncodeFrameWithQuantizeIndex().
  // In EncodeFrame()/EncodeFrameWithQuantizeIndex(), the update will only be
  // triggered when the coded frame is the last one in the previous group of
  // pictures.
  std::vector<EncodeFrameInfo> encode_frame_list;

  // Indicates the index of the next coding frame in encode_frame_list.
  // In other words, EncodeFrameInfo of the next coding frame can be
  // obtained with encode_frame_list[next_encode_frame_index].
  // Internally, next_encode_frame_index will be set to zero after the last
  // frame of the group of pictures is coded. Otherwise, next_encode_frame_index
  // will be increased after each EncodeFrame()/EncodeFrameWithQuantizeIndex()
  // call.
  int next_encode_frame_index;

  // Number of show frames in this group of pictures.
  int show_frame_count;

  // The show index/timestamp of the earliest show frame in the group of
  // pictures.
  int start_show_index;

  // The coding index of the first coding frame in the group of pictures.
  int start_coding_index;

  // Indicates whether this group of pictures starts with a key frame.
  int first_is_key_frame;

  // Indicates whether this group of pictures uses an alt ref.
  int use_alt_ref;

  // Indicates whether previous group of pictures used an alt ref.
  int last_gop_use_alt_ref;
};

class SimpleEncode {
public:
  // When outfile_path is set, the encoder will output the bitstream in ivf
  // format.
  SimpleEncode(int frame_width, int frame_height, int frame_rate_num,
               int frame_rate_den, int target_bitrate, int num_frames,
               int target_level, const char *infile_path,
               const char *outfile_path = nullptr);
  ~SimpleEncode();
  SimpleEncode(SimpleEncode &) = delete;
  SimpleEncode &operator=(const SimpleEncode &) = delete;

  // Adjusts the encoder's coding speed.
  // If this function is not called, the encoder will use default encode_speed
  // 0. Call this function before ComputeFirstPassStats() if needed.
  // The encode_speed is equivalent to --cpu-used of the vpxenc command.
  // The encode_speed's range should be [0, 9].
  // Setting the encode_speed to a higher level will yield faster coding
  // at the cost of lower compression efficiency.
  void SetEncodeSpeed(int encode_speed);

  // Set encoder config
  // The following configs in VP9EncoderConfig are allowed to change in this
  // function. See https://ffmpeg.org/ffmpeg-codecs.html#libvpx for each
  // config's meaning.
  // Configs in VP9EncoderConfig:          Equivalent configs in ffmpeg:
  // 1  key_freq                           -g
  // 2  two_pass_vbrmin_section            -minrate * 100LL / bit_rate
  // 3  two_pass_vbrmax_section            -maxrate * 100LL / bit_rate
  // 4  under_shoot_pct                    -undershoot-pct
  // 5  over_shoot_pct                     -overshoot-pct
  // 6  max_threads                        -threads
  // 7  frame_parallel_decoding_mode       -frame-parallel
  // 8  tile_column                        -tile-columns
  // 9  arnr_max_frames                    -arnr-maxframes
  // 10 arnr_strength                      -arnr-strength
  // 11 lag_in_frames                      -rc_lookahead
  // 12 encode_breakout                    -static-thresh
  // 13 enable_tpl_model                   -enable-tpl
  // 14 enable_auto_arf                    -auto-alt-ref
  // 15 rc_mode
  //    Possible Settings:
  //      0 - Variable Bit Rate (VPX_VBR)  -b:v <bit_rate>
  //      1 - Constant Bit Rate (VPX_CBR)  -b:v <bit_rate> -minrate <bit_rate>
  //                                        -maxrate <bit_rate>
  //        two_pass_vbrmin_section == 100   i.e. bit_rate == minrate == maxrate
  //        two_pass_vbrmax_section == 100
  //      2 - Constrained Quality (VPX_CQ) -crf <cq_level> -b:v bit_rate
  //      3 - Constant Quality (VPX_Q)     -crf <cq_level> -b:v 0
  //    See https://trac.ffmpeg.org/wiki/Encode/VP9 for more details.
  // 16 cq_level                          see rc_mode for details.
  StatusCode SetEncodeConfig(const char *name, const char *value);

  // A debug function that dumps configs from VP9EncoderConfig
  // pass = 1: first pass, pass = 2: second pass
  // fp: file pointer for dumping config
  StatusCode DumpEncodeConfigs(int pass, FILE *fp);

  // Makes encoder compute the first pass stats and store it at
  // impl_ptr_->first_pass_stats. key_frame_map_ is also computed based on the
  // first pass stats.
  void ComputeFirstPassStats();

  // Outputs the first pass stats represented by a 2-D vector.
  // One can use the frame index at first dimension to retrieve the stats for
  // each video frame. The stats of each video frame is a vector of 25 double
  // values. For details, please check FIRSTPASS_STATS in vp9_firstpass.h
  std::vector<std::vector<double>> ObserveFirstPassStats();

  // Outputs the first pass motion vectors represented by a 2-D vector.
  // One can use the frame index at first dimension to retrieve the mvs for
  // each video frame. The frame is divided into 16x16 blocks. The number of
  // elements is round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
  std::vector<std::vector<MotionVectorInfo>> ObserveFirstPassMotionVectors();

  // Ouputs a copy of key_frame_map_, a binary vector with size equal to the
  // number of show frames in the video. For each entry in the vector, 1
  // indicates the position is a key frame and 0 indicates it's not a key frame.
  // This function should be called after ComputeFirstPassStats()
  std::vector<int> ObserveKeyFrameMap() const;

  // Sets group of pictures map for coding the entire video.
  // Each entry in the gop_map corresponds to a show frame in the video.
  // Therefore, the size of gop_map should equal to the number of show frames in
  // the entire video.
  // If a given entry's kGopMapFlagStart is set, it means this is the start of a
  // gop. Once kGopMapFlagStart is set, one can set kGopMapFlagUseAltRef to
  // indicate whether this gop use altref.
  // If a given entry is zero, it means it's in the middle of a gop.
  // This function should be called only once after ComputeFirstPassStats(),
  // before StartEncode().
  // This API will check and modify the gop_map to satisfy the following
  // constraints.
  // 1) Each key frame position should be at the start of a gop.
  // 2) The last gop should not use an alt ref.
  void SetExternalGroupOfPicturesMap(int *gop_map, int gop_map_size);

  // Observe the group of pictures map set through
  // SetExternalGroupOfPicturesMap(). This function should be called after
  // SetExternalGroupOfPicturesMap().
  std::vector<int> ObserveExternalGroupOfPicturesMap();

  // Initializes the encoder for actual encoding.
  // This function should be called after ComputeFirstPassStats().
  void StartEncode();

  // Frees the encoder.
  // This function should be called after StartEncode() or EncodeFrame().
  void EndEncode();

  // The key frame group size includes one key frame plus the number of
  // following inter frames. Note that the key frame group size only counts the
  // show frames. The number of no show frames like alternate refereces are not
  // counted.
  int GetKeyFrameGroupSize() const;

  // Provides the group of pictures that the next coding frame is in.
  // Only call this function between StartEncode() and EndEncode()
  GroupOfPicture ObserveGroupOfPicture() const;

  // Gets encode_frame_info for the next coding frame.
  // Only call this function between StartEncode() and EndEncode()
  EncodeFrameInfo GetNextEncodeFrameInfo() const;

  // Encodes a frame
  // This function should be called after StartEncode() and before EndEncode().
  void EncodeFrame(EncodeFrameResult *encode_frame_result);

  // Encodes a frame with a specific quantize index.
  // This function should be called after StartEncode() and before EndEncode().
  void EncodeFrameWithQuantizeIndex(EncodeFrameResult *encode_frame_result,
                                    int quantize_index);

  // Encode a frame with target frame bits usage.
  // The encoder will find a quantize index to make the actual frame bits usage
  // match the target. EncodeFrameWithTargetFrameBits() will recode the frame
  // up to 7 times to find a q_index to make the actual_frame_bits satisfy the
  // following inequality. |actual_frame_bits - target_frame_bits| * 100 /
  // target_frame_bits
  // <= percent_diff.
  void EncodeFrameWithTargetFrameBits(EncodeFrameResult *encode_frame_result,
                                      int target_frame_bits,
                                      double percent_diff);

  // Gets the number of coding frames for the video. The coding frames include
  // show frame and no show frame.
  // This function should be called after ComputeFirstPassStats().
  int GetCodingFrameNum() const;

  // Gets the total number of pixels of YUV planes per frame.
  uint64_t GetFramePixelCount() const;

private:
  // Compute the key frame locations of the video based on first pass stats.
  // The results are returned as a binary vector with 1s indicating keyframes
  // and 0s indicating non keyframes.
  // It has to be called after impl_ptr_->first_pass_stats is computed.
  std::vector<int> ComputeKeyFrameMap() const;

  // Updates key_frame_group_size_, reset key_frame_group_index_ and init
  // ref_frame_info_.
  void UpdateKeyFrameGroup(int key_frame_show_index);

  // Update key_frame_group_index_.
  void PostUpdateKeyFrameGroupIndex(FrameType frame_type);

  void PostUpdateState(const EncodeFrameResult &encode_frame_result);

  class EncodeImpl;

  int frame_width_;   // frame width in pixels.
  int frame_height_;  // frame height in pixels.
  int frame_rate_num_;
  int frame_rate_den_;
  int target_bitrate_;
  int num_frames_;
  int encode_speed_;
  int target_level_;

  std::FILE *in_file_;
  std::FILE *out_file_;
  std::unique_ptr<EncodeImpl> impl_ptr_;

  std::vector<int> key_frame_map_;
  std::vector<int> gop_map_;
  GroupOfPicture group_of_picture_;

  // The key frame group size includes one key frame plus the number of
  // following inter frames. Note that the key frame group size only counts the
  // show frames. The number of no show frames like alternate references are not
  // counted.
  int key_frame_group_size_;

  // The index for the to-be-coded show frame in the key frame group.
  int key_frame_group_index_;

  // Each show or no show frame is assigned with a coding index based on its
  // coding order (starting from zero) in the coding process of the entire
  // video. The coding index of the to-be-coded frame.
  int frame_coding_index_;

  // Number of show frames we have coded so far.
  int show_frame_count_;

  // TODO(angiebird): Do we need to reset ref_frames_info_ when the next key
  // frame appears?
  // Reference frames info of the to-be-coded frame.
  RefFrameInfo ref_frame_info_;

  // A 2-D vector of motion vector information of the frame collected
  // from the first pass. The first dimension is the frame index.
  // Each frame is divided into 16x16 blocks. The number of elements is
  // round_up(|num_rows_4x4| / 4) * round_up(|num_cols_4x4| / 4).
  // Each 16x16 block contains 0 motion vector if this is an intra predicted
  // frame (for example, the key frame). If the frame is inter predicted,
  // each 16x16 block contains either 1 or 2 motion vectors.
  // The first motion vector is always from the LAST_FRAME.
  // The second motion vector is always from the GOLDEN_FRAME.
  std::vector<std::vector<MotionVectorInfo>> fp_motion_vector_info_;
};

}  // namespace vp9

#endif  // VPX_VP9_SIMPLE_ENCODE_H_

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.13 Sekunden (vorverarbeitet am 2026-04-28) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.