meg-huggingface commited on
Commit
7d16cf6
·
1 Parent(s): 8c1a5a1

Fixing Spaces error so this runs again.

Browse files
app.py CHANGED
@@ -76,11 +76,10 @@ _MIN_VOCAB_COUNT = 10
76
  _SHOW_TOP_N_WORDS = 10
77
 
78
 
79
- @st.cache(
80
  hash_funcs={
81
  dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
82
- },
83
- allow_output_mutation=True,
84
  )
85
  def load_or_prepare(ds_args, show_embeddings, use_cache=False):
86
  """
@@ -125,11 +124,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
125
  dstats.load_or_prepare_zipf()
126
  return dstats
127
 
128
- @st.cache(
129
  hash_funcs={
130
  dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
131
- },
132
- allow_output_mutation=True,
133
  )
134
  def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
135
  """
 
76
  _SHOW_TOP_N_WORDS = 10
77
 
78
 
79
+ @st.cache_resource(
80
  hash_funcs={
81
  dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
82
+ }
 
83
  )
84
  def load_or_prepare(ds_args, show_embeddings, use_cache=False):
85
  """
 
124
  dstats.load_or_prepare_zipf()
125
  return dstats
126
 
127
+ @st.cache_resource(
128
  hash_funcs={
129
  dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
130
+ }
 
131
  )
132
  def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
133
  """
data_measurements/dataset_utils.py CHANGED
@@ -47,7 +47,7 @@ TOT_WORDS = "total words"
47
  TOT_OPEN_WORDS = "total open words"
48
 
49
  _DATASET_LIST = [
50
- "c4",
51
  "squad",
52
  "squad_v2",
53
  "hate_speech18",
@@ -60,7 +60,7 @@ _DATASET_LIST = [
60
  ]
61
 
62
  _STREAMABLE_DATASET_LIST = [
63
- "c4",
64
  "wikitext",
65
  "HuggingFaceM4/OBELICS",
66
  ]
@@ -119,6 +119,7 @@ def load_truncated_dataset(
119
  name=config_name,
120
  split=split_name,
121
  streaming=True,
 
122
  ).take(num_rows)
123
  rows = list(iterable_dataset)
124
  f = open("temp.jsonl", "w", encoding="utf-8")
@@ -160,69 +161,108 @@ def intersect_dfs(df_dict):
160
 
161
  def get_typed_features(features, ftype="string", parents=None):
162
  """
163
- Recursively get a list of all features of a certain dtype
164
- :param features:
165
- :param ftype:
166
- :param parents:
167
- :return: a list of tuples > e.g. ('A', 'B', 'C') for feature example['A']['B']['C']
 
 
 
 
 
168
  """
169
  if parents is None:
170
  parents = []
171
  typed_features = []
 
 
 
 
172
  for name, feat in features.items():
173
  if isinstance(feat, dict):
174
- if feat.get("dtype", None) == ftype or feat.get("feature", {}).get(
175
- ("dtype", None) == ftype
176
- ):
177
- typed_features += [tuple(parents + [name])]
178
  elif "feature" in feat:
179
- if feat["feature"].get("dtype", None) == ftype:
180
- typed_features += [tuple(parents + [name])]
181
- elif isinstance(feat["feature"], dict):
182
- typed_features += get_typed_features(
183
- feat["feature"], ftype, parents + [name]
184
- )
 
 
 
 
 
185
  else:
 
186
  for k, v in feat.items():
187
  if isinstance(v, dict):
188
- typed_features += get_typed_features(
189
- v, ftype, parents + [name, k]
190
  )
191
  elif name == "dtype" and feat == ftype:
192
- typed_features += [tuple(parents)]
 
 
193
  return typed_features
194
 
195
 
196
  def get_label_features(features, parents=None):
197
  """
198
- Recursively get a list of all features that are ClassLabels
199
- :param features:
200
- :param parents:
201
- :return: pairs of tuples as above and the list of class names
 
 
 
 
 
202
  """
203
  if parents is None:
204
  parents = []
205
  label_features = []
 
 
 
 
206
  for name, feat in features.items():
207
  if isinstance(feat, dict):
 
208
  if "names" in feat:
209
- label_features += [(tuple(parents + [name]), feat["names"])]
 
210
  elif "feature" in feat:
211
- if "names" in feat:
212
- label_features += [
213
- (tuple(parents + [name]), feat["feature"]["names"])
214
- ]
215
- elif isinstance(feat["feature"], dict):
216
- label_features += get_label_features(
217
- feat["feature"], parents + [name]
218
- )
 
 
 
 
 
 
219
  else:
 
220
  for k, v in feat.items():
221
  if isinstance(v, dict):
222
- label_features += get_label_features(v, parents + [name, k])
 
 
223
  elif name == "names":
224
- label_features += [(tuple(parents), feat)]
225
- return label_features
 
 
226
 
227
 
228
  # get the info we need for the app sidebar in dict format
 
47
  TOT_OPEN_WORDS = "total open words"
48
 
49
  _DATASET_LIST = [
50
+ "allenai/c4",
51
  "squad",
52
  "squad_v2",
53
  "hate_speech18",
 
60
  ]
61
 
62
  _STREAMABLE_DATASET_LIST = [
63
+ "allenai/c4",
64
  "wikitext",
65
  "HuggingFaceM4/OBELICS",
66
  ]
 
119
  name=config_name,
120
  split=split_name,
121
  streaming=True,
122
+ trust_remote_code=True,
123
  ).take(num_rows)
124
  rows = list(iterable_dataset)
125
  f = open("temp.jsonl", "w", encoding="utf-8")
 
161
 
162
def get_typed_features(features, ftype="string", parents=None):
    """
    Recursively collect the paths of all features with a given dtype.

    Args:
        features: Feature mapping from a HuggingFace dataset (possibly nested).
        ftype: dtype string to look for (e.g. "string", "int32", "float32").
        parents: Accumulated path of enclosing feature names (used in recursion).

    Returns:
        A list of path tuples, e.g. ('A', 'B', 'C') for example['A']['B']['C'].
    """
    prefix = [] if parents is None else parents
    matches = []

    # Non-dict input (e.g. None or a malformed schema) yields no matches.
    if not isinstance(features, dict):
        return matches

    for key, value in features.items():
        if not isinstance(value, dict):
            # Leaf entry: a bare "dtype": <ftype> pair marks the enclosing path.
            if key == "dtype" and value == ftype:
                matches.append(tuple(prefix))
            continue
        if value.get("dtype") == ftype:
            # Direct dtype match on this feature.
            matches.append(tuple(prefix + [key]))
        elif "feature" in value:
            inner = value["feature"]
            if isinstance(inner, dict):
                if inner.get("dtype") == ftype:
                    matches.append(tuple(prefix + [key]))
                else:
                    # Descend into the wrapped feature structure.
                    matches.extend(
                        get_typed_features(inner, ftype, prefix + [key])
                    )
            elif inner == ftype:
                # "feature" maps straight to the dtype string itself.
                matches.append(tuple(prefix + [key]))
        else:
            # Plain nested mapping: recurse into each dict-valued child.
            for child_key, child in value.items():
                if isinstance(child, dict):
                    matches.extend(
                        get_typed_features(child, ftype, prefix + [key, child_key])
                    )

    return matches
212
 
213
 
214
def get_label_features(features, parents=None):
    """
    Recursively collect all ClassLabel-style features and their class names.

    Args:
        features: Feature mapping from a HuggingFace dataset (possibly nested).
        parents: Accumulated path of enclosing feature names (used in recursion).

    Returns:
        A list of (path_tuple, names) pairs, e.g. (('A', 'B'), ['c1', 'c2'])
        for feature example['A']['B'].
    """
    prefix = [] if parents is None else parents
    found = []

    # Non-dict input (e.g. None or a malformed schema) yields no labels.
    if not isinstance(features, dict):
        return found

    for key, value in features.items():
        if not isinstance(value, dict):
            # Leaf entry: a bare "names" key holds the class list for this path.
            if key == "names":
                found.append((tuple(prefix), value))
            continue
        if "names" in value:
            # ClassLabel feature with its class names listed directly.
            found.append((tuple(prefix + [key]), value["names"]))
        elif "feature" in value:
            inner = value["feature"]
            if isinstance(inner, dict):
                if "names" in inner:
                    found.append((tuple(prefix + [key]), inner["names"]))
                else:
                    # Descend into the wrapped feature structure.
                    found.extend(get_label_features(inner, prefix + [key]))
            elif isinstance(inner, list):
                # "feature" maps straight to the list of class names.
                found.append((tuple(prefix + [key]), inner))
        else:
            # Plain nested mapping: recurse into each dict-valued child.
            for child_key, child in value.items():
                if isinstance(child, dict):
                    found.extend(
                        get_label_features(child, prefix + [key, child_key])
                    )

    return found
266
 
267
 
268
  # get the info we need for the app sidebar in dict format
data_measurements/streamlit_utils.py CHANGED
@@ -21,7 +21,8 @@ import streamlit as st
21
  #from st_aggrid import AgGrid, GridOptionsBuilder
22
 
23
  from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
24
- st.set_option('deprecation.showPyplotGlobalUse', False)
 
25
  json_file_path = "cache_dir/has_cache.json"
26
  with open(json_file_path, "r", encoding="utf-8") as j:
27
  _HAS_CACHE = json.loads(j.read())
 
21
  #from st_aggrid import AgGrid, GridOptionsBuilder
22
 
23
  from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
24
+ # Note: deprecation.showPyplotGlobalUse was removed in Streamlit 1.32.0+
25
+ # st.set_option('deprecation.showPyplotGlobalUse', False)
26
  json_file_path = "cache_dir/has_cache.json"
27
  with open(json_file_path, "r", encoding="utf-8") as j:
28
  _HAS_CACHE = json.loads(j.read())
requirements.txt CHANGED
@@ -16,7 +16,7 @@ sentencepiece==0.1.96
16
 
17
  iso_639==0.4.5
18
 
19
- datasets==2.8.0
20
 
21
  powerlaw==1.5
22
 
@@ -39,12 +39,12 @@ streamlit-aggrid
39
  numexpr
40
 
41
  scikit-learn>=0.24.2
42
- tqdm~=4.62.3
43
 
44
  pyarrow
45
- altair<5
46
 
47
 
48
  scipy
49
 
50
- streamlit==1.24.1
 
16
 
17
  iso_639==0.4.5
18
 
19
+ datasets>=2.16.0,<4.0.0
20
 
21
  powerlaw==1.5
22
 
 
39
  numexpr
40
 
41
  scikit-learn>=0.24.2
42
+ tqdm>=4.62.3
43
 
44
  pyarrow
45
+ altair>=5.0.0
46
 
47
 
48
  scipy
49
 
50
+ streamlit>=1.25.0