AB739 committed on
Commit 733ce78 · verified · 1 Parent(s): 128d765

Create audio.py

Files changed (1)
  1. tasks/audio.py +255 -0
tasks/audio.py ADDED
@@ -0,0 +1,255 @@
+ from fastapi import APIRouter
+ from datetime import datetime
+ from datasets import load_dataset
+ from sklearn.metrics import accuracy_score
+ import os
+ import torch
+ from torch import nn
+ import torch.nn.functional as F  # used by the BlazeFace forward pass
+ from torch.utils.data import DataLoader, TensorDataset
+ from torchaudio import transforms
+ from torchvision import models
+
+ from .utils.evaluation import AudioEvaluationRequest
+ from .utils.emissions import tracker, clean_emissions_data, get_space_info
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ router = APIRouter()
+
+ DESCRIPTION = "BlazeFace-style CNN on mel spectrograms"
+ ROUTE = "/audio"
+
+
+
+ @router.post(ROUTE, tags=["Audio Task"],
+              description=DESCRIPTION)
+ async def evaluate_audio(request: AudioEvaluationRequest):
+     """
+     Evaluate audio classification for rainforest sound detection.
+
+     Current Model: BlazeFace-style CNN on mel spectrograms
+     - Predicts one of the two labels (0-1) for each clip
+     - Replaces the random baseline used for comparison
+     """
+     # Get space info
+     username, space_url = get_space_info()
+
+     # Define the label mapping
+     LABEL_MAPPING = {
+         "chainsaw": 0,
+         "environment": 1
+     }
+     # Load and prepare the dataset
+     # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
+     dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
+
+     # Split dataset
+     train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
+     test_dataset = train_test["test"]
+
+     # Start tracking emissions
+     tracker.start()
+     tracker.start_task("inference")
+
+     #--------------------------------------------------------------------------------------------
+     # YOUR MODEL INFERENCE CODE HERE
+     # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+     #--------------------------------------------------------------------------------------------
+
+     # Make predictions using the BlazeFace model defined below
+     true_labels = test_dataset["label"]
+
+     resampler = transforms.Resample(orig_freq=12000, new_freq=16000)
+     mel_transform = transforms.MelSpectrogram(sample_rate=16000, n_mels=64)
+     amplitude_to_db = transforms.AmplitudeToDB()
+
+     def resize_audio(_waveform, target_length):
+         """Resizes the audio waveform to the target length using resampling"""
+         num_frames = _waveform.shape[-1]
+         if num_frames != target_length:
+             _resampler = transforms.Resample(orig_freq=num_frames, new_freq=target_length)
+             _waveform = _resampler(_waveform)
+         return _waveform
+
+     resized_waveforms = [
+         resize_audio(torch.tensor(sample['audio']['array'], dtype=torch.float32).unsqueeze(0), target_length=72000)
+         for sample in test_dataset
+     ]
+
+     # Resample each clip from 12 kHz to 16 kHz and convert it to a dB-scaled mel spectrogram
+     waveforms, labels = [], []
+     for waveform, label in zip(resized_waveforms, true_labels):
+         waveforms.append(amplitude_to_db(mel_transform(resampler(waveform))))
+         labels.append(label)
+
+     waveforms = torch.stack(waveforms)
+     labels = torch.tensor(labels)
+
+     test_loader = DataLoader(
+         TensorDataset(waveforms, labels),
+         batch_size=32,
+         shuffle=False
+     )
+
+     # Unused ResNet18 variant, kept for reference; the BlazeFace model below is the one evaluated
+     class ResNetForAudio(nn.Module):
+         def __init__(self, num_classes=10):
+             super(ResNetForAudio, self).__init__()
+             self.resnet = models.resnet18(pretrained=False)
+             self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
+             self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)
+
+         def forward(self, x):
+             return self.resnet(x)
+
+     #model = ResNetForAudio(num_classes=2)
+
+
+
+     class BlazeFace(nn.Module):
+         def __init__(self, input_channels=1, use_double_block=False, activation="relu", use_optional_block=True):
+             super(BlazeFace, self).__init__()
+             self.activation = activation
+             self.use_double_block = use_double_block
+             self.use_optional_block = use_optional_block
+
+             def conv_block(in_channels, out_channels, kernel_size, stride, padding):
+                 return nn.Sequential(
+                     nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
+                     nn.BatchNorm2d(out_channels),
+                     nn.ReLU() if activation == "relu" else nn.Sigmoid()  # Apply ReLU activation (default) or Sigmoid
+                 )
+
+             def depthwise_separable_block(in_channels, out_channels, stride):
+                 return nn.Sequential(
+                     nn.Conv2d(in_channels, in_channels, kernel_size=5, stride=stride, padding=2, groups=in_channels, bias=False),
+                     nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0),
+                     nn.BatchNorm2d(out_channels),
+                     nn.ReLU() if activation == "relu" else nn.Sigmoid()
+                 )
+
+             def double_block(in_channels, filters_1, filters_2, stride):
+                 return nn.Sequential(
+                     depthwise_separable_block(in_channels, filters_1, stride),
+                     depthwise_separable_block(filters_1, filters_2, 1)
+                 )
+
+             # Define layers (first part: conv layers)
+             self.conv1 = conv_block(input_channels, 24, kernel_size=5, stride=2, padding=2)
+
+             # Define single blocks (subsequent conv blocks)
+             self.single_blocks = nn.ModuleList([
+                 depthwise_separable_block(24, 24, stride=1),
+                 depthwise_separable_block(24, 24, stride=1),
+                 depthwise_separable_block(24, 48, stride=2),
+                 depthwise_separable_block(48, 48, stride=1),
+                 depthwise_separable_block(48, 48, stride=1)
+             ])
+
+             # Define double blocks if `use_double_block` is True
+             if self.use_double_block:
+                 self.double_blocks = nn.ModuleList([
+                     double_block(48, 24, 96, stride=2),
+                     double_block(96, 24, 96, stride=1),
+                     double_block(96, 24, 96, stride=2),
+                     double_block(96, 24, 96, stride=1),
+                     double_block(96, 24, 96, stride=2)
+                 ])
+             else:
+                 self.double_blocks = nn.ModuleList([
+                     depthwise_separable_block(48, 96, stride=2),
+                     depthwise_separable_block(96, 96, stride=1),
+                     depthwise_separable_block(96, 96, stride=2),
+                     depthwise_separable_block(96, 96, stride=1),
+                     depthwise_separable_block(96, 96, stride=2)
+                 ])
+
+             # Final convolutional head
+             self.conv_head = nn.Conv2d(96, 64, kernel_size=1, stride=1)
+             self.bn_head = nn.BatchNorm2d(64)
+
+             # Global Average Pooling
+             self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
+
+         def forward(self, x):
+             # First conv layer
+             x = self.conv1(x)
+
+             # Apply single blocks
+             for block in self.single_blocks:
+                 x = block(x)
+
+             # Apply double blocks
+             for block in self.double_blocks:
+                 x = block(x)
+
+             # Final head
+             x = self.conv_head(x)
+             x = self.bn_head(x)
+             x = F.relu(x)
+
+             # Global Average Pooling and Flatten
+             x = self.global_avg_pooling(x)
+             x = torch.flatten(x, 1)
+
+             return x
+
+     class BlazeFaceModel(nn.Module):
+         def __init__(self, input_channels, label_count, use_double_block=False, activation="relu", use_optional_block=True):
+             super(BlazeFaceModel, self).__init__()
+             self.blazeface_backbone = BlazeFace(input_channels=input_channels, use_double_block=use_double_block, activation=activation, use_optional_block=use_optional_block)
+             self.fc = nn.Linear(64, label_count)  # 64 is the output feature size after pooling
+
+         def forward(self, x):
+             features = self.blazeface_backbone(x)
+             output = self.fc(features)
+             return output
+
+     # Example Usage
+     model_settings = {
+         'spectrogram_length': 64,
+         'dct_coefficient_count': 481,
+         'label_count': 2
+     }
+
+     # Create model
+     model = BlazeFaceModel(input_channels=1, label_count=model_settings['label_count'], use_double_block=False, activation='relu', use_optional_block=False)
+
+     # Load the trained weights and switch BatchNorm layers to evaluation mode
+     model.load_state_dict(torch.load("./best_blazeface_model.pth", map_location=torch.device('cpu')))
+     model.eval()
+
+     predictions = []
+     with torch.inference_mode():
+         for data, target in test_loader:
+             output = model(data)  # (batch, 2) logits; no squeeze, so a final batch of size 1 keeps its batch dim
+             pred = torch.argmax(output, dim=-1)
+             predictions.extend(pred.tolist())
+
+     #--------------------------------------------------------------------------------------------
+     # YOUR MODEL INFERENCE STOPS HERE
+     #--------------------------------------------------------------------------------------------
+
+     # Stop tracking emissions
+     emissions_data = tracker.stop_task()
+
+     # Calculate accuracy
+     accuracy = accuracy_score(true_labels, predictions)
+
+     # Prepare results dictionary
+     results = {
+         "username": username,
+         "space_url": space_url,
+         "submission_timestamp": datetime.now().isoformat(),
+         "model_description": DESCRIPTION,
+         "accuracy": float(accuracy),
+         "energy_consumed_wh": emissions_data.energy_consumed * 1000,
+         "emissions_gco2eq": emissions_data.emissions * 1000,
+         "emissions_data": clean_emissions_data(emissions_data),
+         "api_route": ROUTE,
+         "dataset_config": {
+             "dataset_name": request.dataset_name,
+             "test_size": request.test_size,
+             "test_seed": request.test_seed
+         }
+     }
+
+     return results
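
For local testing, the sketch below shows one way this router could be mounted and the new /audio route exercised. It is an illustration under assumptions, not part of the commit: it assumes the repository root is on the import path, that AudioEvaluationRequest accepts dataset_name, test_size, and test_seed (the only fields the handler reads), and that HF_TOKEN plus ./best_blazeface_model.pth are available; the dataset id shown is a placeholder.

# test_audio_route.py -- hypothetical local smoke test
from fastapi import FastAPI
from fastapi.testclient import TestClient

from tasks.audio import router

app = FastAPI()
app.include_router(router)

client = TestClient(app)
response = client.post("/audio", json={
    "dataset_name": "your-org/your-gated-audio-dataset",  # placeholder; use the challenge's gated dataset
    "test_size": 0.2,
    "test_seed": 42,
})
print(response.status_code)
print(response.json().get("accuracy"))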