diff --git a/README.md b/README.md index a6b28e1..5614a93 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,39 @@ + --- -license: Apache License 2.0 +displayName: SAMSum Corpus +labelTypes: +- Classification +license: +- CC BY-NC-ND 4.0 +mediaTypes: +- Text +paperUrl: https://arxiv.org/pdf/1911.12237v2.pdf +publishDate: "2019" +publishUrl: https://github.com/huggingface/datasets/tree/master/datasets/samsum +publisher: +- Samsung R&D Institute Poland +tags: +- Text +taskTypes: +- Text Summarization/Simplification +- Federated Learning +- Abstractive Text Summarization + --- -数据集文件元信息以及数据文件,请浏览“数据集文件”页面获取。 - -当前数据集卡片使用的是默认模版,数据集的贡献者未提供更加详细的数据集介绍,但是您可以通过如下GIT Clone命令,或者ModelScope SDK来下载数据集 - -#### 下载方法 -:modelscope-code[]{type="sdk"} -:modelscope-code[]{type="git"} +# 数据集介绍 + ## 简介 + SAMSum 数据集包含大约 16k 个带有摘要的、类似即时通讯聊天的对话。对话由精通英语的语言学家创建和记录。语言学家被要求创建类似于他们每天所写的对话,以反映他们现实生活中即时通讯对话的主题比例。风格和语域是多样化的——对话可以是非正式的、半正式的或正式的,它们可能包含俚语、表情符号和错别字。然后,用摘要对对话进行注释。摘要被要求以第三人称简明扼要地概括人们在对话中所谈论的内容。 SAMSum 数据集由波兰三星研发研究所准备并分发用于研究目的(非商业许可:CC BY-NC-ND 4.0)。 + ## 引文 + +``` +@article{gliwa2019samsum, +title={SAMSum corpus: A human-annotated dialogue dataset for abstractive summarization}, +author={Gliwa, Bogdan and Mochol, Iwona and Biesek, Maciej and Wawer, Aleksander}, +journal={arXiv preprint arXiv:1911.12237}, +year={2019} +} +``` + +## Download dataset +:modelscope-code[]{type="git"} \ No newline at end of file diff --git a/metafile.yaml b/metafile.yaml new file mode 100644 index 0000000..4485058 --- /dev/null +++ b/metafile.yaml @@ -0,0 +1,18 @@ +displayName: SAMSum Corpus +labelTypes: +- Classification +license: +- CC BY-NC-ND 4.0 +mediaTypes: +- Text +paperUrl: https://arxiv.org/pdf/1911.12237v2.pdf +publishDate: "2019" +publishUrl: https://github.com/huggingface/datasets/tree/master/datasets/samsum +publisher: +- Samsung R&D Institute Poland +tags: +- Text +taskTypes: +- Text Summarization/Simplification +- Federated Learning +- Abstractive Text Summarization diff --git a/quickstart.md 
b/quickstart.md new file mode 100644 index 0000000..572a8ca --- /dev/null +++ b/quickstart.md @@ -0,0 +1,9 @@ + +## SDK usage +```python +from modelscope.msdatasets import MsDataset + +MsDataset.load("OpenDataLab/SAMSum_Corpus") + +# Note: If the SDK is not available, please use git to download the dataset. +``` \ No newline at end of file diff --git a/raw/corpus.7z b/raw/corpus.7z new file mode 100644 index 0000000..9c8a510 --- /dev/null +++ b/raw/corpus.7z @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97674c66726f66b98a08ca5e8868fb8af9d4843f2b05c4f839bc5cfe91e8899 +size 2944100 diff --git a/sample/other/test.json b/sample/other/test.json new file mode 100644 index 0000000..f1176d0 --- /dev/null +++ b/sample/other/test.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5ac232d04982fdf70766b10bd07223732d8de0b817a0ab2a30a79ea99a693a +size 590609 diff --git a/sample/other/train.json b/sample/other/train.json new file mode 100644 index 0000000..641d6b3 --- /dev/null +++ b/sample/other/train.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0df0898ad4322462cb73959f0ca820b31c4dc27e44b1ef64fbc34f7e12f1cb5 +size 10484215 diff --git a/sample/other/val.json b/sample/other/val.json new file mode 100644 index 0000000..9c0c033 --- /dev/null +++ b/sample/other/val.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f291b34ea2a851c4fd67bc9e1c05003f0dde2c2ae1a501ba733b9d97b6166349 +size 571716