hvdc¶
This dataset comes from DreamBank and is also available from the Sleep and Dream Database (SDDb). More specifically, it merges the Hall/VdC Norms: Female and Hall/VdC Norms: Male datasets from an archived copy of DreamBank hosted on Zenodo. See the krank GitHub Issue and the hvdc docs page for more info.
Setup¶
In [ ]:
Copied!
import os
from datetime import datetime, timezone
import pandas as pd
import pooch
import os
from datetime import datetime, timezone
import pandas as pd
import pooch
Load¶
In [2]:
Copied!
# Pin the DreamBank GitHub release whose assets are fetched below.
# While developing a new release, skip the download and point the two
# `*_fname` variables at local builds instead (see the end of this cell).
DREAMBANK_VERSION = "v1.0.0-alpha5"

# Keyword arguments for `pooch.retrieve`, one set per release asset.
datasets_kwargs = dict(
    url=f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/datasets.csv.xz",
    known_hash="md5:1475582e2daa1da53920df50cb9fc98e",
)
dreams_kwargs = dict(
    url=f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    known_hash="md5:2dcab92f9d9515df174388babb5c9e5a",
    progressbar=True,
)

# Fetch each asset; the returned values are handed to `pd.read_csv` later on.
datasets_fname = pooch.retrieve(**datasets_kwargs)
dreams_fname = pooch.retrieve(**dreams_kwargs)

# Local alternative during development of a new release:
# datasets_fname = "../output/datasets.csv.xz"
# dreams_fname = "../output/dreams.csv.xz"
In [3]:
Copied!
# Load both release tables into DataFrames.
datasets = pd.read_csv(datasets_fname)

# Read `dream_id` as text instead of relying on dtype inference: later cells
# apply `.str` methods to it and concatenate it into `author_id`, and the IDs
# carry leading zeros (e.g. "0001") that numeric parsing would discard.
dreams = pd.read_csv(dreams_fname, dtype={"dream_id": str})
Process¶
In [4]:
Copied!
# Reduce the dream reports to only those from the HVDC Norms datasets.
DATASETS = ["norms-f", "norms-m"]
dreams = dreams[dreams["dataset"].isin(DATASETS)].copy()

# Check the assumptions that the ID construction below relies on.
assert dreams["metadata"].isna().all(), "expected no metadata in the Norms datasets"
assert dreams["dream_id"].str.isdigit().all(), "expected all-digit dream IDs"
assert dreams["dream_id"].map(int).between(1, 500).all(), "expected dream IDs within 1-500"

# Derive one ID per report from the last letter of the dataset name
# ("f" or "m") and the dream ID, plus a spelled-out sex label.
dreams["author_id"] = dreams["dataset"].str[-1] + "-" + dreams["dream_id"]
dreams["sex"] = dreams["dataset"].map({"norms-f": "female", "norms-m": "male"})

# Keep only the exported columns, ordered by author ID.
# NOTE: `reindex` drops the `dataset`, `dream_id` and `metadata` columns used
# above, so this cell must run exactly once per load; running it again on the
# already-reduced frame raises KeyError on "dataset".
dreams = dreams.rename(columns={"dream_text": "report"})
dreams = dreams.reindex(columns=["author_id", "sex", "report"])
dreams = dreams.sort_values("author_id")
In [5]:
Copied!
# Preview the first five rows of the processed table.
dreams.head(5)
Out[5]:
| author_id | sex | report | |
|---|---|---|---|
| 33053 | f-0001 | female | I dreamed it was next summer and that I was go... |
| 33054 | f-0002 | female | I was at home but it was not our house. I saw ... |
| 33055 | f-0003 | female | I was in water like a lake up to my waist and ... |
| 33056 | f-0004 | female | I was riding a bicycle with a boy who is a stu... |
| 33057 | f-0005 | female | I was in biology lab and had a bag of cookies ... |
In [6]:
Copied!
# Summarise column dtypes, non-null counts and memory usage.
dreams.info()
<class 'pandas.core.frame.DataFrame'> Index: 981 entries, 33053 to 34033 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 author_id 981 non-null object 1 sex 981 non-null object 2 report 981 non-null object dtypes: object(3) memory usage: 30.7+ KB
In [7]:
Copied!
# Per-column count, number of unique values, most frequent value and its frequency.
dreams.describe()
Out[7]:
| author_id | sex | report | |
|---|---|---|---|
| count | 981 | 981 | 981 |
| unique | 981 | 2 | 981 |
| top | f-0001 | male | I dreamed it was next summer and that I was go... |
| freq | 1 | 491 | 1 |
Export¶
In [8]:
Copied!
# Final integrity checks on the table before it is written to disk.
assert dreams.notna().all(axis=None)  # no missing value in any cell
assert not dreams.duplicated().any()  # no fully duplicated rows
assert dreams["author_id"].is_unique  # each author ID appears once
assert not (~dreams["sex"].isin(["female", "male"])).any()  # only the two known labels
assert dreams["report"].eq(dreams["report"].str.strip()).all()  # no leading/trailing whitespace
In [9]:
Copied!
# Destination of the exported table; create its parent directory if missing.
outpath = "./output/hvdc.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)

# Options passed to `DataFrame.to_csv`.
TO_CSV_KWARGS = {
    "index": False,
    "na_rep": "N/A",
    "sep": ",",
    "mode": "x",  # Switch to `w` to overwrite existing file
    "encoding": "utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    "lineterminator": "\n",
    "quoting": 2,  # 2 = csv.QUOTE_NONNUMERIC
    "quotechar": '"',
    "doublequote": True,
}

# Write the file exactly once: with mode "x" a second write to the same path
# raises FileExistsError instead of overwriting.
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report the name, size, checksums and UTC modification time of the written file.
print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")
file: hvdc.csv size: 0.640043 MB md5: bd376d9dcc77f5369e7f4a058a1ba166 sha256: 43bddc73f0c7fa388a378f2911a15ae34ad9df17cbd1d347f35c29e3816e589a timestamp: 2025-12-29T23:15:47+00:00