meg-huggingface
committed on
Commit
·
7d16cf6
1
Parent(s):
8c1a5a1
Fixing Spaces error so this runs again.
Browse files- app.py +4 -6
- data_measurements/dataset_utils.py +76 -36
- data_measurements/streamlit_utils.py +2 -1
- requirements.txt +4 -4
app.py
CHANGED
|
@@ -76,11 +76,10 @@ _MIN_VOCAB_COUNT = 10
|
|
| 76 |
_SHOW_TOP_N_WORDS = 10
|
| 77 |
|
| 78 |
|
| 79 |
-
@st.
|
| 80 |
hash_funcs={
|
| 81 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 82 |
-
}
|
| 83 |
-
allow_output_mutation=True,
|
| 84 |
)
|
| 85 |
def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
| 86 |
"""
|
|
@@ -125,11 +124,10 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
|
| 125 |
dstats.load_or_prepare_zipf()
|
| 126 |
return dstats
|
| 127 |
|
| 128 |
-
@st.
|
| 129 |
hash_funcs={
|
| 130 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 131 |
-
}
|
| 132 |
-
allow_output_mutation=True,
|
| 133 |
)
|
| 134 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
| 135 |
"""
|
|
|
|
| 76 |
_SHOW_TOP_N_WORDS = 10
|
| 77 |
|
| 78 |
|
| 79 |
+
@st.cache_resource(
|
| 80 |
hash_funcs={
|
| 81 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 82 |
+
}
|
|
|
|
| 83 |
)
|
| 84 |
def load_or_prepare(ds_args, show_embeddings, use_cache=False):
|
| 85 |
"""
|
|
|
|
| 124 |
dstats.load_or_prepare_zipf()
|
| 125 |
return dstats
|
| 126 |
|
| 127 |
+
@st.cache_resource(
|
| 128 |
hash_funcs={
|
| 129 |
dataset_statistics.DatasetStatisticsCacheClass: lambda dstats: dstats.cache_path
|
| 130 |
+
}
|
|
|
|
| 131 |
)
|
| 132 |
def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
|
| 133 |
"""
|
data_measurements/dataset_utils.py
CHANGED
|
@@ -47,7 +47,7 @@ TOT_WORDS = "total words"
|
|
| 47 |
TOT_OPEN_WORDS = "total open words"
|
| 48 |
|
| 49 |
_DATASET_LIST = [
|
| 50 |
-
"c4",
|
| 51 |
"squad",
|
| 52 |
"squad_v2",
|
| 53 |
"hate_speech18",
|
|
@@ -60,7 +60,7 @@ _DATASET_LIST = [
|
|
| 60 |
]
|
| 61 |
|
| 62 |
_STREAMABLE_DATASET_LIST = [
|
| 63 |
-
"c4",
|
| 64 |
"wikitext",
|
| 65 |
"HuggingFaceM4/OBELICS",
|
| 66 |
]
|
|
@@ -119,6 +119,7 @@ def load_truncated_dataset(
|
|
| 119 |
name=config_name,
|
| 120 |
split=split_name,
|
| 121 |
streaming=True,
|
|
|
|
| 122 |
).take(num_rows)
|
| 123 |
rows = list(iterable_dataset)
|
| 124 |
f = open("temp.jsonl", "w", encoding="utf-8")
|
|
@@ -160,69 +161,108 @@ def intersect_dfs(df_dict):
|
|
| 160 |
|
| 161 |
def get_typed_features(features, ftype="string", parents=None):
|
| 162 |
"""
|
| 163 |
-
Recursively get a list of all features of a certain dtype
|
| 164 |
-
|
| 165 |
-
:
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
"""
|
| 169 |
if parents is None:
|
| 170 |
parents = []
|
| 171 |
typed_features = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
for name, feat in features.items():
|
| 173 |
if isinstance(feat, dict):
|
| 174 |
-
if
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
elif "feature" in feat:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
else:
|
|
|
|
| 186 |
for k, v in feat.items():
|
| 187 |
if isinstance(v, dict):
|
| 188 |
-
typed_features
|
| 189 |
-
v, ftype, parents + [name, k]
|
| 190 |
)
|
| 191 |
elif name == "dtype" and feat == ftype:
|
| 192 |
-
|
|
|
|
|
|
|
| 193 |
return typed_features
|
| 194 |
|
| 195 |
|
| 196 |
def get_label_features(features, parents=None):
|
| 197 |
"""
|
| 198 |
-
Recursively get a list of all features that are ClassLabels
|
| 199 |
-
|
| 200 |
-
:
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""
|
| 203 |
if parents is None:
|
| 204 |
parents = []
|
| 205 |
label_features = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
for name, feat in features.items():
|
| 207 |
if isinstance(feat, dict):
|
|
|
|
| 208 |
if "names" in feat:
|
| 209 |
-
label_features
|
|
|
|
| 210 |
elif "feature" in feat:
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
else:
|
|
|
|
| 220 |
for k, v in feat.items():
|
| 221 |
if isinstance(v, dict):
|
| 222 |
-
label_features
|
|
|
|
|
|
|
| 223 |
elif name == "names":
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
# get the info we need for the app sidebar in dict format
|
|
|
|
| 47 |
TOT_OPEN_WORDS = "total open words"
|
| 48 |
|
| 49 |
_DATASET_LIST = [
|
| 50 |
+
"allenai/c4",
|
| 51 |
"squad",
|
| 52 |
"squad_v2",
|
| 53 |
"hate_speech18",
|
|
|
|
| 60 |
]
|
| 61 |
|
| 62 |
_STREAMABLE_DATASET_LIST = [
|
| 63 |
+
"allenai/c4",
|
| 64 |
"wikitext",
|
| 65 |
"HuggingFaceM4/OBELICS",
|
| 66 |
]
|
|
|
|
| 119 |
name=config_name,
|
| 120 |
split=split_name,
|
| 121 |
streaming=True,
|
| 122 |
+
trust_remote_code=True,
|
| 123 |
).take(num_rows)
|
| 124 |
rows = list(iterable_dataset)
|
| 125 |
f = open("temp.jsonl", "w", encoding="utf-8")
|
|
|
|
| 161 |
|
| 162 |
def get_typed_features(features, ftype="string", parents=None):
|
| 163 |
"""
|
| 164 |
+
Recursively get a list of all features of a certain dtype.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
features: Feature dictionary from HuggingFace dataset
|
| 168 |
+
ftype: Type to search for (e.g., "string", "int32", "float32")
|
| 169 |
+
parents: List of parent feature names for nested features
|
| 170 |
+
|
| 171 |
+
Returns:
|
| 172 |
+
List of tuples representing feature paths, e.g. ('A', 'B', 'C')
|
| 173 |
+
for feature example['A']['B']['C']
|
| 174 |
"""
|
| 175 |
if parents is None:
|
| 176 |
parents = []
|
| 177 |
typed_features = []
|
| 178 |
+
|
| 179 |
+
if not isinstance(features, dict):
|
| 180 |
+
return typed_features
|
| 181 |
+
|
| 182 |
for name, feat in features.items():
|
| 183 |
if isinstance(feat, dict):
|
| 184 |
+
# Check if this feature has the target dtype directly
|
| 185 |
+
if feat.get("dtype") == ftype:
|
| 186 |
+
typed_features.append(tuple(parents + [name]))
|
| 187 |
+
# Check if nested in a "feature" key
|
| 188 |
elif "feature" in feat:
|
| 189 |
+
nested_feat = feat["feature"]
|
| 190 |
+
if isinstance(nested_feat, dict):
|
| 191 |
+
if nested_feat.get("dtype") == ftype:
|
| 192 |
+
typed_features.append(tuple(parents + [name]))
|
| 193 |
+
else:
|
| 194 |
+
# Recursively search nested feature structure
|
| 195 |
+
typed_features.extend(
|
| 196 |
+
get_typed_features(nested_feat, ftype, parents + [name])
|
| 197 |
+
)
|
| 198 |
+
elif nested_feat == ftype:
|
| 199 |
+
typed_features.append(tuple(parents + [name]))
|
| 200 |
else:
|
| 201 |
+
# Recursively search other nested dictionaries
|
| 202 |
for k, v in feat.items():
|
| 203 |
if isinstance(v, dict):
|
| 204 |
+
typed_features.extend(
|
| 205 |
+
get_typed_features(v, ftype, parents + [name, k])
|
| 206 |
)
|
| 207 |
elif name == "dtype" and feat == ftype:
|
| 208 |
+
# Handle case where dtype is a direct key
|
| 209 |
+
typed_features.append(tuple(parents))
|
| 210 |
+
|
| 211 |
return typed_features
|
| 212 |
|
| 213 |
|
| 214 |
def get_label_features(features, parents=None):
|
| 215 |
"""
|
| 216 |
+
Recursively get a list of all features that are ClassLabels.
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
features: Feature dictionary from HuggingFace dataset
|
| 220 |
+
parents: List of parent feature names for nested features
|
| 221 |
+
|
| 222 |
+
Returns:
|
| 223 |
+
List of tuples containing (feature_path, class_names) pairs,
|
| 224 |
+
e.g. (('A', 'B'), ['class1', 'class2']) for feature example['A']['B']
|
| 225 |
"""
|
| 226 |
if parents is None:
|
| 227 |
parents = []
|
| 228 |
label_features = []
|
| 229 |
+
|
| 230 |
+
if not isinstance(features, dict):
|
| 231 |
+
return label_features
|
| 232 |
+
|
| 233 |
for name, feat in features.items():
|
| 234 |
if isinstance(feat, dict):
|
| 235 |
+
# Check if this feature has "names" directly (ClassLabel feature)
|
| 236 |
if "names" in feat:
|
| 237 |
+
label_features.append((tuple(parents + [name]), feat["names"]))
|
| 238 |
+
# Check if nested in a "feature" key
|
| 239 |
elif "feature" in feat:
|
| 240 |
+
nested_feat = feat["feature"]
|
| 241 |
+
if isinstance(nested_feat, dict):
|
| 242 |
+
if "names" in nested_feat:
|
| 243 |
+
label_features.append(
|
| 244 |
+
(tuple(parents + [name]), nested_feat["names"])
|
| 245 |
+
)
|
| 246 |
+
else:
|
| 247 |
+
# Recursively search nested feature structure
|
| 248 |
+
label_features.extend(
|
| 249 |
+
get_label_features(nested_feat, parents + [name])
|
| 250 |
+
)
|
| 251 |
+
elif isinstance(nested_feat, list):
|
| 252 |
+
# Handle case where names is a list directly
|
| 253 |
+
label_features.append((tuple(parents + [name]), nested_feat))
|
| 254 |
else:
|
| 255 |
+
# Recursively search other nested dictionaries
|
| 256 |
for k, v in feat.items():
|
| 257 |
if isinstance(v, dict):
|
| 258 |
+
label_features.extend(
|
| 259 |
+
get_label_features(v, parents + [name, k])
|
| 260 |
+
)
|
| 261 |
elif name == "names":
|
| 262 |
+
# Handle case where names is a direct key
|
| 263 |
+
label_features.append((tuple(parents), feat))
|
| 264 |
+
|
| 265 |
+
return label_features
|
| 266 |
|
| 267 |
|
| 268 |
# get the info we need for the app sidebar in dict format
|
data_measurements/streamlit_utils.py
CHANGED
|
@@ -21,7 +21,8 @@ import streamlit as st
|
|
| 21 |
#from st_aggrid import AgGrid, GridOptionsBuilder
|
| 22 |
|
| 23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 24 |
-
|
|
|
|
| 25 |
json_file_path = "cache_dir/has_cache.json"
|
| 26 |
with open(json_file_path, "r", encoding="utf-8") as j:
|
| 27 |
_HAS_CACHE = json.loads(j.read())
|
|
|
|
| 21 |
#from st_aggrid import AgGrid, GridOptionsBuilder
|
| 22 |
|
| 23 |
from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
|
| 24 |
+
# Note: deprecation.showPyplotGlobalUse was removed in Streamlit 1.32.0+
|
| 25 |
+
# st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 26 |
json_file_path = "cache_dir/has_cache.json"
|
| 27 |
with open(json_file_path, "r", encoding="utf-8") as j:
|
| 28 |
_HAS_CACHE = json.loads(j.read())
|
requirements.txt
CHANGED
|
@@ -16,7 +16,7 @@ sentencepiece==0.1.96
|
|
| 16 |
|
| 17 |
iso_639==0.4.5
|
| 18 |
|
| 19 |
-
datasets
|
| 20 |
|
| 21 |
powerlaw==1.5
|
| 22 |
|
|
@@ -39,12 +39,12 @@ streamlit-aggrid
|
|
| 39 |
numexpr
|
| 40 |
|
| 41 |
scikit-learn>=0.24.2
|
| 42 |
-
tqdm
|
| 43 |
|
| 44 |
pyarrow
|
| 45 |
-
altair
|
| 46 |
|
| 47 |
|
| 48 |
scipy
|
| 49 |
|
| 50 |
-
streamlit
|
|
|
|
| 16 |
|
| 17 |
iso_639==0.4.5
|
| 18 |
|
| 19 |
+
datasets>=2.8.0,<4.0.0
|
| 20 |
|
| 21 |
powerlaw==1.5
|
| 22 |
|
|
|
|
| 39 |
numexpr
|
| 40 |
|
| 41 |
scikit-learn>=0.24.2
|
| 42 |
+
tqdm>=4.62.3
|
| 43 |
|
| 44 |
pyarrow
|
| 45 |
+
altair>=5.0.0
|
| 46 |
|
| 47 |
|
| 48 |
scipy
|
| 49 |
|
| 50 |
+
streamlit>=1.24.1
|