urbina1975¶
This dataset comes from DreamBank. It merges the Peruvian men and Peruvian women datasets from an archived copy of DreamBank hosted on Zenodo. See the krank GitHub Issue and the urbina1975 docs page for more info.
Setup¶
In [1]:
Copied!
from datetime import datetime, timezone
import os
import pandas as pd
import pooch
from datetime import datetime, timezone
import os
import pandas as pd
import pooch
In [2]:
Copied!
# DreamBank dataset IDs that are merged into urbina1975.
DATASETS = [
    "peru-f",
    "peru-m",
]
# DreamBank dataset IDs that are merged into urbina1975.
DATASETS = [
    "peru-f",
    "peru-m",
]
Load¶
In [3]:
Copied!
# Pinned DreamBank release whose files are fetched from GitHub.
# While developing a new release, use the local-file assignments that are
# commented out at the bottom of this cell instead of the downloads.
DREAMBANK_VERSION = "v1.0.0-alpha5"

# datasets.csv.xz: fetch with pooch and check it against the known hash.
datasets_kwargs = {
    "url": f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/datasets.csv.xz",
    "known_hash": "md5:1475582e2daa1da53920df50cb9fc98e",
}
datasets_fname = pooch.retrieve(**datasets_kwargs)

# dreams.csv.xz: same procedure, with a progress bar requested.
dreams_kwargs = {
    "url": f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    "known_hash": "md5:2dcab92f9d9515df174388babb5c9e5a",
    "progressbar": True,
}
dreams_fname = pooch.retrieve(**dreams_kwargs)

# datasets_fname = "../output/datasets.csv.xz"
# dreams_fname = "../output/dreams.csv.xz"
# Pinned DreamBank release whose files are fetched from GitHub.
# While developing a new release, use the local-file assignments that are
# commented out at the bottom of this cell instead of the downloads.
DREAMBANK_VERSION = "v1.0.0-alpha5"

# datasets.csv.xz: fetch with pooch and check it against the known hash.
datasets_kwargs = {
    "url": f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/datasets.csv.xz",
    "known_hash": "md5:1475582e2daa1da53920df50cb9fc98e",
}
datasets_fname = pooch.retrieve(**datasets_kwargs)

# dreams.csv.xz: same procedure, with a progress bar requested.
dreams_kwargs = {
    "url": f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    "known_hash": "md5:2dcab92f9d9515df174388babb5c9e5a",
    "progressbar": True,
}
dreams_fname = pooch.retrieve(**dreams_kwargs)

# datasets_fname = "../output/datasets.csv.xz"
# dreams_fname = "../output/dreams.csv.xz"
In [4]:
Copied!
# Read both retrieved CSV files into DataFrames (datasets first, then dreams).
datasets, dreams = (
    pd.read_csv(path) for path in (datasets_fname, dreams_fname)
)
# Read both retrieved CSV files into DataFrames (datasets first, then dreams).
datasets, dreams = (
    pd.read_csv(path) for path in (datasets_fname, dreams_fname)
)
Process¶
In [5]:
Copied!
# Keep only the rows whose dataset is listed in DATASETS; copy so later
# column assignments work on an independent frame.
in_selected_datasets = dreams["dataset"].isin(DATASETS)
dreams = dreams.loc[in_selected_datasets].copy()
# Keep only the rows whose dataset is listed in DATASETS; copy so later
# column assignments work on an independent frame.
in_selected_datasets = dreams["dataset"].isin(DATASETS)
dreams = dreams.loc[in_selected_datasets].copy()
In [6]:
Copied!
# Sanity-check the input assumptions that the later processing relies on.
# The metadata column is expected to be entirely empty for these datasets.
assert dreams["metadata"].isna().all(), "Unexpected non-null metadata"
# IDs must be exactly <F|M><00-49>-<1-8>. `fullmatch` is used because
# `match` with a trailing `$` would also accept an ID followed by a newline.
assert dreams["dream_id"].str.fullmatch(r"[FM][0-4][0-9]-[1-8]").all(), "Malformed dream_id"
# The F/M prefix of each ID must agree with the f/m suffix of its dataset.
assert dreams["dataset"].str[-1].eq(dreams["dream_id"].str[0].str.lower()).all(), "dataset/dream_id mismatch"
# Sanity-check the input assumptions that the later processing relies on.
# The metadata column is expected to be entirely empty for these datasets.
assert dreams["metadata"].isna().all(), "Unexpected non-null metadata"
# IDs must be exactly <F|M><00-49>-<1-8>. `fullmatch` is used because
# `match` with a trailing `$` would also accept an ID followed by a newline.
assert dreams["dream_id"].str.fullmatch(r"[FM][0-4][0-9]-[1-8]").all(), "Malformed dream_id"
# The F/M prefix of each ID must agree with the f/m suffix of its dataset.
assert dreams["dataset"].str[-1].eq(dreams["dream_id"].str[0].str.lower()).all(), "dataset/dream_id mismatch"
In [7]:
Copied!
# Derive two columns in one step:
#   sex    - from the last character of the dataset ID (f -> female, m -> male)
#   author - the first three characters of the dream ID (e.g. F01)
dreams = dreams.assign(
    sex=dreams["dataset"].str[-1].map({"f": "female", "m": "male"}),
    author=dreams["dream_id"].str[:3],
)
# Derive two columns in one step:
#   sex    - from the last character of the dataset ID (f -> female, m -> male)
#   author - the first three characters of the dream ID (e.g. F01)
dreams = dreams.assign(
    sex=dreams["dataset"].str[-1].map({"f": "female", "m": "male"}),
    author=dreams["dream_id"].str[:3],
)
In [8]:
Copied!
# Rename the text column, keep only the exported columns, and order by author.
# A stable sort is requested so that each author's dreams keep their original
# relative order; pandas' default quicksort does not guarantee this.
# NOTE(review): this can change the row order within an author compared with
# files exported using the default sort, and therefore the file hash.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "sex", "report"])
    .sort_values("author", kind="stable")
)
# Rename the text column, keep only the exported columns, and order by author.
# A stable sort is requested so that each author's dreams keep their original
# relative order; pandas' default quicksort does not guarantee this.
# NOTE(review): this can change the row order within an author compared with
# files exported using the default sort, and therefore the file hash.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "sex", "report"])
    .sort_values("author", kind="stable")
)
In [9]:
Copied!
# Minor fix: See https://github.com/remrama/dreambank/issues/6
# Remove a backslash found at the end of a report.
trailing_backslash = r"\\$"
dreams["report"] = dreams["report"].str.replace(trailing_backslash, "", regex=True)
# Minor fix: See https://github.com/remrama/dreambank/issues/6
# Remove a backslash found at the end of a report.
trailing_backslash = r"\\$"
dreams["report"] = dreams["report"].str.replace(trailing_backslash, "", regex=True)
In [10]:
Copied!
# Preview the first rows of the processed table.
dreams.head()
# Preview the first rows of the processed table.
dreams.head()
Out[10]:
| author | sex | report | |
|---|---|---|---|
| 35127 | F01 | female | I dreamed I was a the beach with my sister, su... |
| 35128 | F01 | female | I dreamed I was married to a very tall and fat... |
| 35129 | F01 | female | I dreamed about an unknown person who had a sn... |
| 35130 | F01 | female | I dreamed about a friend and we were going by ... |
| 35131 | F01 | female | I dreamed I was at a beach and then a group of... |
In [11]:
Copied!
# Per-column summary (count, unique, top, freq) of the processed table.
dreams.describe()
# Per-column summary (count, unique, top, freq) of the processed table.
dreams.describe()
Out[11]:
| author | sex | report | |
|---|---|---|---|
| count | 766 | 766 | 766 |
| unique | 96 | 2 | 766 |
| top | F01 | male | I dreamed I was a the beach with my sister, su... |
| freq | 8 | 384 | 1 |
In [12]:
Copied!
# Print column dtypes, non-null counts, and memory usage of the table.
dreams.info(memory_usage="deep")
# Print column dtypes, non-null counts, and memory usage of the table.
dreams.info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'> Index: 766 entries, 35127 to 35892 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 author 766 non-null object 1 sex 766 non-null object 2 report 766 non-null object dtypes: object(3) memory usage: 603.5 KB
Export¶
In [13]:
Copied!
# Final integrity checks on the table before it is written to disk.
reports = dreams["report"]
assert dreams.notna().all(axis=None)  # no missing values in any column
assert not dreams.duplicated().any()  # no fully duplicated rows
assert not reports.duplicated().any()  # no report text appears twice
assert dreams["sex"].isin(["female", "male"]).all()  # only the two expected labels
assert reports.eq(reports.str.strip()).all()  # no leading/trailing whitespace
# Final integrity checks on the table before it is written to disk.
reports = dreams["report"]
assert dreams.notna().all(axis=None)  # no missing values in any column
assert not dreams.duplicated().any()  # no fully duplicated rows
assert not reports.duplicated().any()  # no report text appears twice
assert dreams["sex"].isin(["female", "male"]).all()  # only the two expected labels
assert reports.eq(reports.str.strip()).all()  # no leading/trailing whitespace
In [14]:
Copied!
# Options forwarded to DataFrame.to_csv for the exported file.
TO_CSV_KWARGS = {
    "index": False,
    "na_rep": "N/A",
    "sep": ",",
    "mode": "x",  # Switch to `w` to overwrite existing file
    "encoding": "utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    "lineterminator": "\n",
    "quoting": 2,  # 2 = csv.QUOTE_NONNUMERIC
    "quotechar": '"',
    "doublequote": True,
}

# Create the output directory if needed, then write the table.
outpath = "./output/urbina1975.csv"
outdir = os.path.dirname(outpath)
os.makedirs(outdir, exist_ok=True)
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report the written file's name, size, checksums, and UTC modification time.
print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")
# Options forwarded to DataFrame.to_csv for the exported file.
TO_CSV_KWARGS = {
    "index": False,
    "na_rep": "N/A",
    "sep": ",",
    "mode": "x",  # Switch to `w` to overwrite existing file
    "encoding": "utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    "lineterminator": "\n",
    "quoting": 2,  # 2 = csv.QUOTE_NONNUMERIC
    "quotechar": '"',
    "doublequote": True,
}

# Create the output directory if needed, then write the table.
outpath = "./output/urbina1975.csv"
outdir = os.path.dirname(outpath)
os.makedirs(outdir, exist_ok=True)
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report the written file's name, size, checksums, and UTC modification time.
print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")
file: urbina1975.csv size: 0.506346 MB md5: 05197362b787f8bef2e7c11fb9b81bf1 sha256: 4367e9c0b09176b920cb241c1658323eae68a7f51b5df8e4dce2fecffc343b3a timestamp: 2025-12-30T04:11:42+00:00