ucsc1996¶
This dataset comes from DreamBank, where it is listed as the "UCSC women, 1996" dataset. It is drawn from an archived copy of DreamBank hosted on Zenodo. See the krank GitHub Issue and the ucsc1996 docs page for more info.
Setup¶
In [ ]:
Copied!
import os
from datetime import datetime, timezone
import pandas as pd
import pooch
import os
from datetime import datetime, timezone
import pandas as pd
import pooch
In [2]:
Copied!
# Values of the DreamBank `dataset` column that make up this corpus
DATASETS = ["ucsc_women"]
Load¶
In [3]:
Copied!
# Fetch the pinned DreamBank release asset from GitHub releases.
# While developing a new release, use the commented-out local path at the
# bottom of this cell instead of the downloaded file.
DREAMBANK_VERSION = "v1.0.0-alpha5"
dreams_kwargs = dict(
    url=f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    known_hash="md5:2dcab92f9d9515df174388babb5c9e5a",  # must match the pinned version above
    progressbar=True,
)
dreams_fname = pooch.retrieve(**dreams_kwargs)
# dreams_fname = "../output/dreams.csv.xz"
In [4]:
Copied!
# Read the full DreamBank table (every dataset) from the retrieved file
dreams = pd.read_csv(dreams_fname)
Process¶
In [5]:
Copied!
# Keep only the rows belonging to this corpus; copy so later column
# assignments operate on an independent frame rather than a slice.
in_corpus = dreams["dataset"].isin(DATASETS)
dreams = dreams.loc[in_corpus].copy()
In [6]:
Copied!
# Verify assumptions about the data
# - metadata is always "<two-digit age>/female"
# - dream_id is a 1-2 digit number, optionally suffixed with "a" or "b"
assert dreams["metadata"].str.match(r"^[12][8901234]/female$").all()
assert dreams["dream_id"].str.match(r"^[1-9][0-9]?[a-b]?$").all()
# Verify that the age and sex metadata are consistent for the a/b person.
# Stripping the a/b suffix groups reports by person; each person must map to
# exactly one metadata string. This enforces the check instead of relying on
# a visual inspection of the table displayed below.
assert (
    dreams.groupby(dreams["dream_id"].str.rstrip("ab"))["metadata"]
    .nunique()
    .eq(1)
    .all()
)
# Display the suffixed (a/b) rows for reference
dreams[dreams["dream_id"].str.len() > 2]
Out[6]:
| dataset | dream_id | metadata | word_count | dream_text | |
|---|---|---|---|---|---|
| 37182 | ucsc_women | 16a | 20/female | 124 | In part of the dream I was arguing with my now... |
| 37183 | ucsc_women | 16b | 20/female | 60 | I dreamt that my ex-boyfriend forgave me for b... |
In [7]:
Copied!
# Derive the participant-level columns in a single step:
# - author: leading digits of dream_id (a/b suffix dropped), kept as string
# - age: two-digit number preceding "/female" in metadata, as integer
# - sex: constant, since every row in this corpus is female
dreams = dreams.assign(
    author=dreams["dream_id"].str.extract(r"^([1-9][0-9]?)[a-b]?$", expand=False).astype(str),
    age=dreams["metadata"].str.extract(r"^([12][8901234])/female$", expand=False).astype(int),
    sex="female",
)
In [8]:
Copied!
# Standardize column names and order, then order rows by author.
# `author` is a string column, so the sort is lexicographic ("10" precedes "2").
# A stable sort keeps each author's reports in their original relative order
# (dream 16a stays ahead of 16b); the default quicksort does not guarantee this.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "age", "sex", "report"])
    .sort_values("author", kind="stable")
)
In [9]:
Copied!
# Preview the first five rows of the processed table
dreams.head(n=5)
Out[9]:
| author | age | sex | report | |
|---|---|---|---|---|
| 37168 | 1 | 18 | female | I had this dream at the beginning of the schoo... |
| 37177 | 10 | 19 | female | It was my first day of class. It was a small c... |
| 37178 | 12 | 19 | female | I am at a place with buildings surroundings, o... |
| 37179 | 13 | 19 | female | I was watching TV with my boyfriend. I remembe... |
| 37180 | 14 | 19 | female | I was in a concert hall of my favorite singer,... |
In [10]:
Copied!
# Summary statistics for the numeric columns
dreams.describe(include=["number"])
Out[10]:
| age | |
|---|---|
| count | 81.000000 |
| mean | 20.617284 |
| std | 1.280311 |
| min | 18.000000 |
| 25% | 20.000000 |
| 50% | 21.000000 |
| 75% | 21.000000 |
| max | 24.000000 |
In [11]:
Copied!
# Summary (count / unique / top / freq) for the non-numeric columns
dreams.describe(exclude=["number"])
Out[11]:
| author | sex | report | |
|---|---|---|---|
| count | 81 | 81 | 81 |
| unique | 80 | 1 | 81 |
| top | 16 | female | I had this dream at the beginning of the schoo... |
| freq | 2 | 81 | 1 |
In [12]:
Copied!
# Column dtypes, non-null counts, and memory footprint
# ("deep" also measures the contents of object columns)
dreams.info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'> Index: 81 entries, 37168 to 37176 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 author 81 non-null object 1 age 81 non-null int64 2 sex 81 non-null object 3 report 81 non-null object dtypes: int64(1), object(3) memory usage: 67.3 KB
Export¶
In [13]:
Copied!
# Final sanity checks before writing the file.
# Completeness: no missing values anywhere in the table
assert dreams.notna().all(axis=None)
# Uniqueness: no repeated rows, and no report text appearing twice
assert not dreams.duplicated().any()
assert dreams["report"].is_unique
# Expected values: all female, ages within 18-24 (inclusive)
assert (dreams["sex"] == "female").all()
assert dreams["age"].between(18, 24).all()
# Cleanliness: reports carry no leading/trailing whitespace
assert dreams["report"].eq(dreams["report"].str.strip()).all()
In [14]:
Copied!
# Write the processed table to disk, creating the output directory if needed
outpath = "./output/ucsc1996.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)
TO_CSV_KWARGS = dict(
    index=False,
    na_rep="N/A",
    sep=",",
    mode="x",  # Switch to `w` to overwrite existing file
    encoding="utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    lineterminator="\n",
    quoting=2,  # 2 = csv.QUOTE_NONNUMERIC
    quotechar='"',
    doublequote=True,
)
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report details of the written file: name, size, checksums, and
# modification time in UTC
size_mb = os.path.getsize(outpath) / 1e6
md5_digest = pooch.file_hash(outpath, alg="md5")
sha256_digest = pooch.file_hash(outpath, alg="sha256")
modified_utc = datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc)
print(f"file: {os.path.basename(outpath)}")
print(f"size: {size_mb} MB")
print(f"md5: {md5_digest}")
print(f"sha256: {sha256_digest}")
print(f"timestamp: {modified_utc.isoformat(timespec='seconds')}")
file: ucsc1996.csv size: 0.056766 MB md5: caf94dc6bac916330caeb200e60c5671 sha256: cc10c7e80e99039928bfe29bbb825dd1b3737cc994dcf9eba4650e1b1daa8c15 timestamp: 2025-12-30T18:35:09+00:00