OpenShot Library | libopenshot  0.5.0
AudioWaveformer.cpp
// Copyright (c) 2008-2022 OpenShot Studios, LLC
//
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "AudioWaveformer.h"

#include <cmath>

#include <algorithm>
#include <chrono>
#include <memory>
#include <thread>
#include <vector>

#include "Clip.h"
#include "Exceptions.h"
#include "FrameMapper.h"
#include "FFmpegReader.h"
#include "Timeline.h"


using namespace std;
using namespace openshot;


// Default constructor
AudioWaveformer::AudioWaveformer(ReaderBase* new_reader) :
    reader(new_reader),
    detached_reader(nullptr),
    resolved_reader(nullptr),
    source_initialized(false)
{

}

// Destructor
AudioWaveformer::~AudioWaveformer()
{

}

// Extract audio samples from any ReaderBase class
AudioWaveformData AudioWaveformer::ExtractSamples(int channel, int num_per_second, bool normalize) {
    // Legacy entry point: resolve a source reader (unwrap Clip/FrameMapper), then extract audio-only.
    AudioWaveformData data;
    if (!reader) {
        return data;
    }

    ReaderBase* source = ResolveWaveformReader();

    Fraction source_fps = ResolveSourceFPS(source);

    AudioWaveformData base = ExtractSamplesFromReader(source, channel, num_per_second, false);

    // If this is a Clip, apply its keyframes using project fps (timeline if available, else reader fps)
    if (auto clip = dynamic_cast<Clip*>(reader)) {
        Timeline* timeline = dynamic_cast<Timeline*>(clip->ParentTimeline());
        Fraction project_fps = timeline ? timeline->info.fps : clip->Reader()->info.fps;
        return ApplyKeyframes(base, &clip->time, &clip->volume, project_fps, source_fps, source->info.channels, num_per_second, channel, normalize);
    }

    // No keyframes to apply
    if (normalize) {
        float max_sample = 0.0f;
        for (auto v : base.max_samples) {
            max_sample = std::max(max_sample, std::abs(v));
        }
        if (max_sample > 0.0f) {
            base.scale(static_cast<int>(base.max_samples.size()), 1.0f / max_sample);
        }
    }
    return base;
}
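
// A minimal usage sketch for the reader-based overload above (the file name and
// parameter values are illustrative assumptions, not part of this file):
//
//     FFmpegReader reader("clip.mp4");
//     reader.Open();
//     AudioWaveformer waveformer(&reader);
//     // 20 waveform points per second, combining all channels (-1), scaled so the peak is 1.0
//     AudioWaveformData waveform = waveformer.ExtractSamples(-1, 20, true);
//     reader.Close();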

AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path, int channel, int num_per_second, bool normalize) {
    FFmpegReader temp_reader(path);
    temp_reader.Open();
    // Disable video for speed
    bool has_video = temp_reader.info.has_video;
    temp_reader.info.has_video = false;
    AudioWaveformData data = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, normalize);
    temp_reader.info.has_video = has_video;
    temp_reader.Close();
    return data;
}
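
// This overload ignores the reader passed to the constructor; it opens its own
// FFmpegReader for the given path (with the video stream temporarily disabled)
// and therefore never mutates a reader that may be in use elsewhere.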

AudioWaveformData AudioWaveformer::ExtractSamples(const std::string& path,
        const Keyframe* time_keyframe,
        const Keyframe* volume_keyframe,
        const Fraction& project_fps,
        int channel,
        int num_per_second,
        bool normalize) {
    FFmpegReader temp_reader(path);
    temp_reader.Open();
    bool has_video = temp_reader.info.has_video;
    temp_reader.info.has_video = false;
    Fraction source_fps = temp_reader.info.fps;
    AudioWaveformData base = ExtractSamplesFromReader(&temp_reader, channel, num_per_second, false);
    temp_reader.info.has_video = has_video;
    temp_reader.Close();
    return ApplyKeyframes(base, time_keyframe, volume_keyframe, project_fps, source_fps, temp_reader.info.channels, num_per_second, channel, normalize);
}
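
// Sketch of calling the keyframed, path-based overload (the path, fps and curve
// points are illustrative assumptions):
//
//     Keyframe volume;
//     volume.AddPoint(1, 0.0);
//     volume.AddPoint(150, 1.0);   // fade in over the first 150 project frames
//     Fraction project_fps(30, 1);
//     AudioWaveformer waveformer(nullptr);
//     AudioWaveformData waveform = waveformer.ExtractSamples(
//         "voiceover.wav", nullptr, &volume, project_fps, -1, 20, true);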

AudioWaveformData AudioWaveformer::ApplyKeyframes(const AudioWaveformData& base,
        const Keyframe* time_keyframe,
        const Keyframe* volume_keyframe,
        const Fraction& project_fps,
        const Fraction& source_fps,
        int source_channels,
        int num_per_second,
        int channel,
        bool normalize) {
    AudioWaveformData data;
    if (num_per_second <= 0) {
        return data;
    }

    double project_fps_value = project_fps.ToDouble();
    double source_fps_value = source_fps.ToDouble();
    if (project_fps_value <= 0.0 || source_fps_value <= 0.0) {
        return data;
    }

    if (channel != -1 && (channel < 0 || channel >= source_channels)) {
        return data;
    }

    size_t base_total = base.max_samples.size();
    if (base_total == 0) {
        return data;
    }

    // Determine output duration from time curve (if any). Time curves are in project-frame domain.
    int64_t output_frames = 0;
    if (time_keyframe && time_keyframe->GetCount() > 0) {
        output_frames = time_keyframe->GetLength();
    }
    if (output_frames <= 0) {
        // Default to source duration derived from base waveform length
        double source_duration = static_cast<double>(base_total) / static_cast<double>(num_per_second);
        output_frames = static_cast<int64_t>(std::llround(source_duration * project_fps_value));
    }
    double output_duration_seconds = static_cast<double>(output_frames) / project_fps_value;
    int total_samples = static_cast<int>(std::ceil(output_duration_seconds * num_per_second));

    if (total_samples <= 0) {
        return data;
    }

    data.resize(total_samples);
    data.zero(total_samples);

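    // Worked example of the mapping below (numbers are illustrative assumptions):
    // with project_fps = 30 and num_per_second = 20, output index i = 40 is
    // out_time = 2.0 s, i.e. project frame 60. If the time curve returns 120 at
    // frame 60 (the clip plays at 2x speed there), then source_time = 4.0 s and
    // source_index = 80, so this output point reads base point 80 (neighbouring
    // points are blended linearly when the index is fractional) and is finally
    // scaled by the volume curve evaluated at project frame 60.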
    for (int i = 0; i < total_samples; ++i) {
        double out_time = static_cast<double>(i) / static_cast<double>(num_per_second);
        // Time keyframes are defined in project-frame domain; evaluate using project frames
        double project_frame = out_time * project_fps_value;
        double mapped_project_frame = time_keyframe ? time_keyframe->GetValue(project_frame) : project_frame;
        // Convert mapped project frame to seconds (project FPS), then to waveform index
        double source_time = mapped_project_frame / project_fps_value;
        double source_index = source_time * static_cast<double>(num_per_second);

        // Sample base waveform (nearest with simple linear blend)
        int idx0 = static_cast<int>(std::floor(source_index));
        int idx1 = idx0 + 1;
        double frac = source_index - static_cast<double>(idx0);

        float max_sample = 0.0f;
        float rms_sample = 0.0f;
        if (idx0 >= 0 && idx0 < static_cast<int>(base_total)) {
            max_sample = base.max_samples[idx0];
            rms_sample = base.rms_samples[idx0];
        }
        if (idx1 >= 0 && idx1 < static_cast<int>(base_total)) {
            max_sample = static_cast<float>((1.0 - frac) * max_sample + frac * base.max_samples[idx1]);
            rms_sample = static_cast<float>((1.0 - frac) * rms_sample + frac * base.rms_samples[idx1]);
        }

        double gain = 1.0;
        if (volume_keyframe) {
            gain = volume_keyframe->GetValue(project_frame);
        }
        max_sample = static_cast<float>(max_sample * gain);
        rms_sample = static_cast<float>(rms_sample * gain);

        data.max_samples[i] = max_sample;
        data.rms_samples[i] = rms_sample;
    }

    if (normalize) {
        float samples_max = 0.0f;
        for (auto v : data.max_samples) {
            samples_max = std::max(samples_max, std::abs(v));
        }
        if (samples_max > 0.0f) {
            data.scale(total_samples, 1.0f / samples_max);
        }
    }

    return data;
}

AudioWaveformData AudioWaveformer::ExtractSamplesFromReader(ReaderBase* source_reader, int channel, int num_per_second, bool normalize) {
    AudioWaveformData data;

    if (!source_reader || num_per_second <= 0) {
        return data;
    }

    // Open reader (if needed)
    if (!source_reader->IsOpen()) {
        source_reader->Open();
    }

    const auto retry_delay = std::chrono::milliseconds(100);
    const auto max_wait_for_open = std::chrono::milliseconds(3000);

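    // GetFrame() can throw ReaderClosed, e.g. if the reader is closed or reopened
    // concurrently; the helper below retries every retry_delay until
    // max_wait_for_open has elapsed, then rethrows the exception.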
    auto get_frame_with_retry = [&](int64_t frame_number) -> std::shared_ptr<openshot::Frame> {
        std::chrono::steady_clock::time_point wait_start;
        bool waiting_for_open = false;
        while (true) {
            try {
                return source_reader->GetFrame(frame_number);
            } catch (const openshot::ReaderClosed&) {
                auto now = std::chrono::steady_clock::now();
                if (!waiting_for_open) {
                    waiting_for_open = true;
                    wait_start = now;
                } else if (now - wait_start >= max_wait_for_open) {
                    throw;
                }

                std::this_thread::sleep_for(retry_delay);
            }
        }
    };

    int sample_rate = source_reader->info.sample_rate;
    if (sample_rate <= 0) {
        sample_rate = num_per_second;
    }
    int sample_divisor = sample_rate / num_per_second;
    if (sample_divisor <= 0) {
        sample_divisor = 1;
    }
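    // Each output point summarizes sample_rate / num_per_second consecutive source
    // samples per channel; e.g. 44100 Hz audio at 20 points per second gives a
    // sample_divisor of 2205 (the numbers here are an illustrative assumption).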

    // Determine length of video frames (for waveform)
    int64_t reader_video_length = source_reader->info.video_length;
    if (reader_video_length < 0) {
        reader_video_length = 0;
    }
    float reader_duration = source_reader->info.duration;
    double fps_value = source_reader->info.fps.ToDouble();
    float frames_duration = 0.0f;
    if (reader_video_length > 0 && fps_value > 0.0) {
        frames_duration = static_cast<float>(reader_video_length / fps_value);
    }
    if (reader_duration <= 0.0f) {
        reader_duration = frames_duration;
    }
    if (reader_duration < 0.0f) {
        reader_duration = 0.0f;
    }

    if (!source_reader->info.has_audio) {
        return data;
    }

    int total_samples = static_cast<int>(std::ceil(reader_duration * num_per_second));
    if (total_samples <= 0 || source_reader->info.channels == 0) {
        return data;
    }

    if (channel != -1 && (channel < 0 || channel >= source_reader->info.channels)) {
        return data;
    }

    // Resize and clear audio buffers
    data.resize(total_samples);
    data.zero(total_samples);

    int extracted_index = 0;
    int sample_index = 0;
    float samples_max = 0.0f;
    float chunk_max = 0.0f;
    double chunk_squared_sum = 0.0;

    int channel_count = (channel == -1) ? source_reader->info.channels : 1;
    std::vector<float*> channels(source_reader->info.channels, nullptr);

    try {
        for (int64_t f = 1; f <= reader_video_length && extracted_index < total_samples; f++) {
            std::shared_ptr<openshot::Frame> frame = get_frame_with_retry(f);

            for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
                if (channel == channel_index || channel == -1) {
                    channels[channel_index] = frame->GetAudioSamples(channel_index);
                }
            }

            int sample_count = frame->GetAudioSamplesCount();
            for (int s = 0; s < sample_count; s++) {
                for (int channel_index = 0; channel_index < source_reader->info.channels; channel_index++) {
                    if (channel == channel_index || channel == -1) {
                        float *samples = channels[channel_index];
                        if (!samples) {
                            continue;
                        }
                        float abs_sample = std::abs(samples[s]);
                        chunk_squared_sum += static_cast<double>(samples[s]) * static_cast<double>(samples[s]);
                        chunk_max = std::max(chunk_max, abs_sample);
                    }
                }

                sample_index += 1;

                if (sample_index % sample_divisor == 0) {
                    float avg_squared_sum = 0.0f;
                    if (channel_count > 0) {
                        avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_divisor * channel_count));
                    }

                    if (extracted_index < total_samples) {
                        data.max_samples[extracted_index] = chunk_max;
                        data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
                        samples_max = std::max(samples_max, chunk_max);
                        extracted_index++;
                    }

                    sample_index = 0;
                    chunk_max = 0.0f;
                    chunk_squared_sum = 0.0;

                    if (extracted_index >= total_samples) {
                        break;
                    }
                }
            }
        }
    } catch (...) {
        throw;
    }

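    // Any trailing samples that did not fill a complete chunk still produce one
    // final waveform point, averaged over the samples actually accumulated.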
    if (sample_index > 0 && extracted_index < total_samples) {
        float avg_squared_sum = 0.0f;
        if (channel_count > 0) {
            avg_squared_sum = static_cast<float>(chunk_squared_sum / static_cast<double>(sample_index * channel_count));
        }

        data.max_samples[extracted_index] = chunk_max;
        data.rms_samples[extracted_index] = std::sqrt(avg_squared_sum);
        samples_max = std::max(samples_max, chunk_max);
        extracted_index++;
    }

    if (normalize && samples_max > 0.0f) {
        float scale = 1.0f / samples_max;
        data.scale(total_samples, scale);
    }

    return data;
}

ReaderBase* AudioWaveformer::ResolveSourceReader(ReaderBase* source_reader) {
    if (!source_reader) {
        return nullptr;
    }

    ReaderBase* current = source_reader;
    while (true) {
        if (auto clip = dynamic_cast<Clip*>(current)) {
            current = clip->Reader();
            continue;
        }
        if (auto mapper = dynamic_cast<FrameMapper*>(current)) {
            current = mapper->Reader();
            continue;
        }
        break;
    }
    return current;
}

Fraction AudioWaveformer::ResolveSourceFPS(ReaderBase* source_reader) {
    if (!source_reader) {
        return Fraction(0, 1);
    }
    return source_reader->info.fps;
}

// Resolve and cache the reader used for waveform extraction (prefer a detached FFmpegReader clone)
ReaderBase* AudioWaveformer::ResolveWaveformReader() {
    if (source_initialized) {
        return resolved_reader ? resolved_reader : reader;
    }
    source_initialized = true;

    resolved_reader = ResolveSourceReader(reader);

    // Prefer a detached, audio-only FFmpegReader clone so we never mutate the live reader used for preview.
    if (auto ff_reader = dynamic_cast<FFmpegReader*>(resolved_reader)) {
        const Json::Value ff_json = ff_reader->JsonValue();
        const std::string path = ff_json.get("path", "").asString();
        if (!path.empty()) {
            try {
                auto clone = std::make_unique<FFmpegReader>(path, false);
                clone->SetJsonValue(ff_json);
                clone->info.has_video = false; // explicitly audio-only for waveform extraction
                detached_reader = std::move(clone);
                resolved_reader = detached_reader.get();
            } catch (...) {
                // Fall back to using the original reader if cloning fails
                detached_reader.reset();
                resolved_reader = ResolveSourceReader(reader);
            }
        }
    }

    return resolved_reader ? resolved_reader : reader;
}
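
// Sketch of extracting a keyframed waveform from a Clip (the path and keyframe
// points below are illustrative assumptions):
//
//     Clip clip("interview.mp4");
//     clip.Open();
//     clip.volume.AddPoint(1, 1.0);    // full volume on the first frame
//     clip.volume.AddPoint(300, 0.0);  // fade to silence by project frame 300
//     AudioWaveformer waveformer(&clip);
//     // The Clip branch in ExtractSamples() applies the clip's time/volume curves
//     AudioWaveformData waveform = waveformer.ExtractSamples(-1, 20, true);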