meg-huggingface
committed on
Commit
·
7d16cf6
1
Parent(s):
8c1a5a1
Fixing Spaces error so this runs again.
Browse files- app.py +4 -6
- data_measurements/dataset_utils.py +76 -36
- data_measurements/streamlit_utils.py +2 -1
- requirements.txt +4 -4
app.py
CHANGED
|
@@ -76,11 +76,10 @@ _MIN_VOCAB_COUNT = 10
|
|
| 76 |
_SHOW_TOP_N_WORDS = 10
|
| 77 |
|
| 78 |
|
| 79 |
-
@st.
|
| 80 |
hash_funcs={
|
| 81 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 82 |
-
}
|
| 83 |
-
allow_output_mutation=True,
|
| 84 |
)
|
| 85 |
def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
| 86 |
"""
|
|
@@ -125,11 +124,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
|
| 125 |
dstats.load_or_prepare_zipf()
|
| 126 |
return dstats
|
| 127 |
|
| 128 |
-
@st.
|
| 129 |
hash_funcs={
|
| 130 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 131 |
-
}
|
| 132 |
-
allow_output_mutation=True,
|
| 133 |
)
|
| 134 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
| 135 |
"""
|
|
|
|
| 76 |
_SHOW_TOP_N_WORDS = 10
|
| 77 |
|
| 78 |
|
| 79 |
+
@st.cache_resource(
|
| 80 |
hash_funcs={
|
| 81 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 82 |
+
}
|
|
|
|
| 83 |
)
|
| 84 |
def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
| 85 |
"""
|
|
|
|
| 124 |
dstats.load_or_prepare_zipf()
|
| 125 |
return dstats
|
| 126 |
|
| 127 |
+
@st.cache_resource(
|
| 128 |
hash_funcs={
|
| 129 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 130 |
+
}
|
|
|
|
| 131 |
)
|
| 132 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
| 133 |
"""
|
data_measurements/dataset_utils.py
CHANGED
|
@@ -47,7 +47,7 @@ TOT_WORDS = "total words"
|
|
| 47 |
TOT_OPEN_WORDS = "total open words"
|
| 48 |
|
| 49 |
_DATASET_LIST = [
|
| 50 |
-
"c4",
|
| 51 |
"squad",
|
| 52 |
"squad_v2",
|
| 53 |
"hate_speech18",
|
|
@@ -60,7 +60,7 @@ _DATASET_LIST = [
|
|
| 60 |
]
|
| 61 |
|
| 62 |
_STREAMABLE_DATASET_LIST = [
|
| 63 |
-
"c4",
|
| 64 |
"wikitext",
|
| 65 |
"HuggingFaceM4/OBELICS",
|
| 66 |
]
|
|
@@ -119,6 +119,7 @@ def load_truncated_dataset(
|
|
| 119 |
name=config_name,
|
| 120 |
split=split_name,
|
| 121 |
streaming=True,
|
|
|
|
| 122 |
).take(num_rows)
|
| 123 |
rows = list(iterable_dataset)
|
| 124 |
f = open("temp.jsonl", "w", encoding="utf-8")
|
|
@@ -160,69 +161,108 @@ def intersect_dfs(df_dict):
|
|
| 160 |
|
| 161 |
def get_typed_features(features, ftype="string", parents=None):
|
| 162 |
"""
|
| 163 |
-
Recursively get a list of all features of a certain dtype
|
| 164 |
-
|
| 165 |
-
:
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
"""
|
| 169 |
if parents is None:
|
| 170 |
parents = []
|
| 171 |
typed_features = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
for name, feat in features.items():
|
| 173 |
if isinstance(feat, dict):
|
| 174 |
-
if
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
elif "feature" in feat:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
else:
|
|
|
|
| 186 |
for k, v in feat.items():
|
| 187 |
if isinstance(v, dict):
|
| 188 |
-
typed_features
|
| 189 |
-
v, ftype, parents + [name, k]
|
| 190 |
)
|
| 191 |
elif name == "dtype" and feat == ftype:
|
| 192 |
-
|
|
|
|
|
|
|
| 193 |
return typed_features
|
| 194 |
|
| 195 |
|
| 196 |
def get_label_features(features, parents=None):
|
| 197 |
"""
|
| 198 |
-
Recursively get a list of all features that are ClassLabels
|
| 199 |
-
|
| 200 |
-
:
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""
|
| 203 |
if parents is None:
|
| 204 |
parents = []
|
| 205 |
label_features = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
for name, feat in features.items():
|
| 207 |
if isinstance(feat, dict):
|
|
|
|
| 208 |
if "names" in feat:
|
| 209 |
-
label_features
|
|
|
|
| 210 |
elif "feature" in feat:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
else:
|
|
|
|
| 220 |
for k, v in feat.items():
|
| 221 |
if isinstance(v, dict):
|
| 222 |
-
label_features
|
|
|
|
|
|
|
| 223 |
elif name == "names":
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
# get the info we need for the app sidebar in dict format
|
|
|
|
| 47 |
TOT_OPEN_WORDS = "total open words"
|
| 48 |
|
| 49 |
_DATASET_LIST = [
|
| 50 |
+
"allenai/c4",
|
| 51 |
"squad",
|
| 52 |
"squad_v2",
|
| 53 |
"hate_speech18",
|
|
|
|
| 60 |
]
|
| 61 |
|
| 62 |
_STREAMABLE_DATASET_LIST = [
|
| 63 |
+
"allenai/c4",
|
| 64 |
"wikitext",
|
| 65 |
"HuggingFaceM4/OBELICS",
|
| 66 |
]
|
|
|
|
| 119 |
name=config_name,
|
| 120 |
split=split_name,
|
| 121 |
streaming=True,
|
| 122 |
+
trust_remote_code=True,
|
| 123 |
).take(num_rows)
|
| 124 |
rows = list(iterable_dataset)
|
| 125 |
f = open("temp.jsonl", "w", encoding="utf-8")
|
|
|
|
| 161 |
|
| 162 |
def get_typed_features(features, ftype="string", parents=None):
|
| 163 |
"""
|
| 164 |
+
Recursively get a list of all features of a certain dtype.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
features: Feature dictionary from HuggingFace dataset
|
| 168 |
+
ftype: Type to search for (e.g., "string", "int32", "float32")
|
| 169 |
+
parents: List of parent feature names for nested features
|
| 170 |
+
|
| 171 |
+
Returns:
|
| 172 |
+
List of tuples representing feature paths, e.g. ('A', 'B', 'C')
|
| 173 |
+
for feature example['A']['B']['C']
|
| 174 |
"""
|
| 175 |
if parents is None:
|
| 176 |
parents = []
|
| 177 |
typed_features = []
|
| 178 |
+
|
| 179 |
+
if not isinstance(features, dict):
|
| 180 |
+
return typed_features
|
| 181 |
+
|
| 182 |
for name, feat in features.items():
|
| 183 |
if isinstance(feat, dict):
|
| 184 |
+
# Check if this feature has the target dtype directly
|
| 185 |
+
if feat.get("dtype") == ftype:
|
| 186 |
+
typed_features.append(tuple(parents + [name]))
|
| 187 |
+
# Check if nested in a "feature" key
|
| 188 |
elif "feature" in feat:
|
| 189 |
+
nested_feat = feat["feature"]
|
| 190 |
+
if isinstance(nested_feat, dict):
|
| 191 |
+
if nested_feat.get("dtype") == ftype:
|
| 192 |
+
typed_features.append(tuple(parents + [name]))
|
| 193 |
+
else:
|
| 194 |
+
# Recursively search nested feature structure
|
| 195 |
+
typed_features.extend(
|
| 196 |
+
get_typed_features(nested_feat, ftype, parents + [name])
|
| 197 |
+
)
|
| 198 |
+
elif nested_feat == ftype:
|
| 199 |
+
typed_features.append(tuple(parents + [name]))
|
| 200 |
else:
|
| 201 |
+
# Recursively search other nested dictionaries
|
| 202 |
for k, v in feat.items():
|
| 203 |
if isinstance(v, dict):
|
| 204 |
+
typed_features.extend(
|
| 205 |
+
get_typed_features(v, ftype, parents + [name, k])
|
| 206 |
)
|
| 207 |
elif name == "dtype" and feat == ftype:
|
| 208 |
+
# Handle case where dtype is a direct key
|
| 209 |
+
typed_features.append(tuple(parents))
|
| 210 |
+
|
| 211 |
return typed_features
|
| 212 |
|
| 213 |
|
| 214 |
def get_label_features(features, parents=None):
|
| 215 |
"""
|
| 216 |
+
Recursively get a list of all features that are ClassLabels.
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
features: Feature dictionary from HuggingFace dataset
|
| 220 |
+
parents: List of parent feature names for nested features
|
| 221 |
+
|
| 222 |
+
Returns:
|
| 223 |
+
List of tuples containing (feature_path, class_names) pairs,
|
| 224 |
+
e.g. (('A', 'B'), ['class1', 'class2']) for feature example['A']['B']
|
| 225 |
"""
|
| 226 |
if parents is None:
|
| 227 |
parents = []
|
| 228 |
label_features = []
|
| 229 |
+
|
| 230 |
+
if not isinstance(features, dict):
|
| 231 |
+
return label_features
|
| 232 |
+
|
| 233 |
for name, feat in features.items():
|
| 234 |
if isinstance(feat, dict):
|
| 235 |
+
# Check if this feature has "names" directly (ClassLabel feature)
|
| 236 |
if "names" in feat:
|
| 237 |
+
label_features.append((tuple(parents + [name]), feat["names"]))
|
| 238 |
+
# Check if nested in a "feature" key
|
| 239 |
elif "feature" in feat:
|
| 240 |
+
nested_feat = feat["feature"]
|
| 241 |
+
if isinstance(nested_feat, dict):
|
| 242 |
+
if "names" in nested_feat:
|
| 243 |
+
label_features.append(
|
| 244 |
+
(tuple(parents + [name]), nested_feat["names"])
|
| 245 |
+
)
|
| 246 |
+
else:
|
| 247 |
+
# Recursively search nested feature structure
|
| 248 |
+
label_features.extend(
|
| 249 |
+
get_label_features(nested_feat, parents + [name])
|
| 250 |
+
)
|
| 251 |
+
elif isinstance(nested_feat, list):
|
| 252 |
+
# Handle case where names is a list directly
|
| 253 |
+
label_features.append((tuple(parents + [name]), nested_feat))
|
| 254 |
else:
|
| 255 |
+
# Recursively search other nested dictionaries
|
| 256 |
for k, v in feat.items():
|
| 257 |
if isinstance(v, dict):
|
| 258 |
+
label_features.extend(
|
| 259 |
+
get_label_features(v, parents + [name, k])
|
| 260 |
+
)
|
| 261 |
elif name == "names":
|
| 262 |
+
# Handle case where names is a direct key
|
| 263 |
+
label_features.append((tuple(parents), feat))
|
| 264 |
+
|
| 265 |
+
return label_features
|
| 266 |
|
| 267 |
|
| 268 |
# get the info we need for the app sidebar in dict format
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -21,7 +21,8 @@ import streamlit as st
|
|
| 21 |
#from st_aggrid import AgGrid, GridOptionsBuilder
|
| 22 |
|
| 23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 24 |
-
|
|
|
|
| 25 |
json_file_path = "cache_dir/has_cache.json"
|
| 26 |
with open(json_file_path, "r", encoding="utf-8") as j:
|
| 27 |
_HAS_CACHE = json.loads(j.read())
|
|
|
|
| 21 |
#from st_aggrid import AgGrid, GridOptionsBuilder
|
| 22 |
|
| 23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 24 |
+
# Note: deprecation.showPyplotGlobalUse was removed in Streamlit 1.32.0+
|
| 25 |
+
# st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 26 |
json_file_path = "cache_dir/has_cache.json"
|
| 27 |
with open(json_file_path, "r", encoding="utf-8") as j:
|
| 28 |
_HAS_CACHE = json.loads(j.read())
|
requirements.txt
CHANGED
|
@@ -16,7 +16,7 @@ sentencepiece==0.1.96
|
|
| 16 |
|
| 17 |
iso_639==0.4.5
|
| 18 |
|
| 19 |
-
datasets
|
| 20 |
|
| 21 |
powerlaw==1.5
|
| 22 |
|
|
@@ -39,12 +39,12 @@ streamlit-aggrid
|
|
| 39 |
numexpr
|
| 40 |
|
| 41 |
scikit-learn>=0.24.2
|
| 42 |
-
tqdm
|
| 43 |
|
| 44 |
pyarrow
|
| 45 |
-
altair
|
| 46 |
|
| 47 |
|
| 48 |
scipy
|
| 49 |
|
| 50 |
-
streamlit
|
|
|
|
| 16 |
|
| 17 |
iso_639==0.4.5
|
| 18 |
|
| 19 |
+
datasets>=2.8.0,<4.0.0
|
| 20 |
|
| 21 |
powerlaw==1.5
|
| 22 |
|
|
|
|
| 39 |
numexpr
|
| 40 |
|
| 41 |
scikit-learn>=0.24.2
|
| 42 |
+
tqdm>=4.62.3
|
| 43 |
|
| 44 |
pyarrow
|
| 45 |
+
altair>=5.0.0
|
| 46 |
|
| 47 |
|
| 48 |
scipy
|
| 49 |
|
| 50 |
+
streamlit>=1.24.1
|