Skip to content

Commit f201b84

Browse files
authored
Merge branch 'main' into daar-op-1
2 parents b300a61 + 93a9a15 commit f201b84

410 files changed

Lines changed: 23159 additions & 18483 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.coveragerc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
[run]
2+
dynamic_context = test_function
3+
concurrency = multiprocessing
4+
sigterm = True
5+
parallel = True
6+
27
omit =
38
# avoid measuring strange non-existing files
49
/workspace/config.py

.github/workflows/publish-pypi.yml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,18 @@ jobs:
2626
- name: Set up Python
2727
uses: actions/setup-python@v5
2828
with:
29-
python-version: '3.x'
30-
- name: Install dependencies
31-
run: |
32-
python -m pip install --upgrade pip
33-
pip install setuptools
29+
python-version: '3.10'
30+
- name: Install Hatch
31+
uses: pypa/hatch@install
3432
- name: Build package
35-
run: python setup.py sdist bdist_wheel
33+
run: hatch build
34+
- name: Install Wheel and Run Tests
35+
run: |
36+
python -m venv venv
37+
source venv/bin/activate
38+
pip install dist/*.whl
39+
python -c "import data_juicer; print(data_juicer.__version__)"
40+
dj-process --help
3641
- name: Publish package
3742
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
3843
with:

.github/workflows/unit-test-partial.yml

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@ name: unittest-partial
66
on:
77
workflow_dispatch:
88
pull_request:
9-
push:
10-
branches:
11-
- main
129

1310
permissions:
1411
contents: read
@@ -36,6 +33,12 @@ jobs:
3633
run: |
3734
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
3835
36+
- name: Print Pip Dependency Tree
37+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
38+
run: |
39+
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system pipdeptree'
40+
docker compose exec ray-head bash -c 'pipdeptree'
41+
3942
- name: Clean dataset cache
4043
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
4144
run: |
@@ -44,14 +47,20 @@ jobs:
4447
- name: Run unittest standalone
4548
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
4649
run: |
47-
docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head bash -c 'python tests/run.py --tag standalone --mode partial'
50+
if [ "${{ github.event.pull_request.head.repo.full_name }}" != "${{ github.repository }}" ]; then
51+
docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head bash -c 'python tests/run.py --tag standalone --mode partial --from-fork True'
52+
else
53+
docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head bash -c 'python tests/run.py --tag standalone --mode partial'
54+
fi
55+
docker compose exec ray-head bash -c 'coverage combine'
56+
docker compose exec ray-head bash -c 'mv .coverage .coverage.standalone'
4857
4958
- name: Upload coverage report of standalone
5059
uses: actions/upload-artifact@v4
5160
with:
5261
name: coverage_report_standalone
5362
include-hidden-files: true
54-
path: dj-${{ github.run_id }}/.coverage
63+
path: dj-${{ github.run_id }}/.coverage.standalone
5564

5665
- name: Remove docker compose
5766
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
@@ -93,12 +102,69 @@ jobs:
93102
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
94103
run: |
95104
docker compose exec ray-head bash -c 'python tests/run.py --tag ray --mode partial'
105+
docker compose exec ray-head bash -c 'coverage combine'
106+
docker compose exec ray-head bash -c 'mv .coverage .coverage.ray'
96107
97108
- name: Upload coverage report of ray
98109
uses: actions/upload-artifact@v4
99110
with:
100111
name: coverage_report_ray
101112
include-hidden-files: true
113+
path: dj-${{ github.run_id }}/.coverage.ray
114+
115+
- name: Remove docker compose
116+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
117+
if: always()
118+
run: |
119+
docker compose down --remove-orphans
120+
121+
- name: Cleanup workspace
122+
if: always()
123+
run: |
124+
rm -rf dj-${{ github.run_id }}
125+
126+
combine-coverage:
127+
needs: [unittest-single, unittest-dist]
128+
runs-on: [GPU, unittest]
129+
steps:
130+
- uses: actions/checkout@v3
131+
with:
132+
path: dj-${{ github.run_id }}
133+
fetch-depth: 0
134+
135+
- name: Setup docker compose
136+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
137+
run: |
138+
docker compose up -d
139+
140+
- name: Install coverage
141+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
142+
run: |
143+
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system coverage'
144+
145+
- name: Download Coverage Report Standalone
146+
uses: actions/download-artifact@v4
147+
with:
148+
name: coverage_report_standalone
149+
path: dj-${{ github.run_id }}
150+
151+
- name: Download Coverage Report Ray
152+
uses: actions/download-artifact@v4
153+
with:
154+
name: coverage_report_ray
155+
path: dj-${{ github.run_id }}
156+
157+
- name: Combine Coverage Reports
158+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
159+
run: |
160+
docker compose exec ray-head bash -c 'coverage combine'
161+
docker compose exec ray-head bash -c 'coverage report -m'
162+
163+
- name: Upload Overall Coverage Report
164+
uses: actions/upload-artifact@v4
165+
with:
166+
name: coverage_report_all
167+
include-hidden-files: true
102168
path: dj-${{ github.run_id }}/.coverage
103169

104170
- name: Remove docker compose

.github/workflows/unit-test.yml

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ jobs:
1919
unittest-single:
2020
runs-on: [GPU, unittest]
2121
environment: Testing
22-
outputs:
23-
total: ${{ steps.total.outputs.total }}
2422
steps:
2523
- uses: actions/checkout@v3
2624
with:
@@ -37,6 +35,12 @@ jobs:
3735
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
3836
docker compose exec ray-worker bash -c '/root/.local/bin/uv pip install --system -e .\[all\]'
3937
38+
- name: Print Pip Dependency Tree
39+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
40+
run: |
41+
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system pipdeptree'
42+
docker compose exec ray-head bash -c 'pipdeptree'
43+
4044
- name: Clean dataset cache
4145
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
4246
run: |
@@ -46,19 +50,15 @@ jobs:
4650
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
4751
run: |
4852
docker compose exec -e OPENAI_BASE_URL=${{ secrets.OPENAI_BASE_URL }} -e OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} ray-head bash -c 'python tests/run.py --tag standalone --mode regression'
53+
docker compose exec ray-head bash -c 'coverage combine'
54+
docker compose exec ray-head bash -c 'mv .coverage .coverage.standalone'
4955
5056
- name: Upload coverage report of standalone
5157
uses: actions/upload-artifact@v4
5258
with:
5359
name: coverage_report_standalone
5460
include-hidden-files: true
55-
path: dj-${{ github.run_id }}/.coverage
56-
57-
- name: Get Total Coverage Result
58-
id: total
59-
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
60-
run: |
61-
echo "total=$(docker compose exec ray-head coverage report --format=total)" >> $GITHUB_OUTPUT
61+
path: dj-${{ github.run_id }}/.coverage.standalone
6262

6363
- name: Remove docker compose
6464
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
@@ -99,14 +99,79 @@ jobs:
9999
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
100100
run: |
101101
docker compose exec ray-head bash -c 'python tests/run.py --tag ray --mode regression'
102+
docker compose exec ray-head bash -c 'coverage combine'
103+
docker compose exec ray-head bash -c 'mv .coverage .coverage.ray'
102104
103105
- name: Upload coverage report of ray
104106
uses: actions/upload-artifact@v4
105107
with:
106108
name: coverage_report_ray
107109
include-hidden-files: true
110+
path: dj-${{ github.run_id }}/.coverage.ray
111+
112+
- name: Remove docker compose
113+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
114+
if: always()
115+
run: |
116+
docker compose down --remove-orphans
117+
118+
- name: Cleanup workspace
119+
if: always()
120+
run: |
121+
rm -rf dj-${{ github.run_id }}
122+
123+
combine-coverage:
124+
needs: [unittest-single, unittest-dist]
125+
runs-on: [GPU, unittest]
126+
outputs:
127+
total: ${{ steps.total.outputs.total }}
128+
steps:
129+
- uses: actions/checkout@v3
130+
with:
131+
path: dj-${{ github.run_id }}
132+
fetch-depth: 0
133+
134+
- name: Setup docker compose
135+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
136+
run: |
137+
docker compose up -d
138+
139+
- name: Install coverage
140+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
141+
run: |
142+
docker compose exec ray-head bash -c '/root/.local/bin/uv pip install --system coverage'
143+
144+
- name: Download Coverage Report Standalone
145+
uses: actions/download-artifact@v4
146+
with:
147+
name: coverage_report_standalone
148+
path: dj-${{ github.run_id }}
149+
150+
- name: Download Coverage Report Ray
151+
uses: actions/download-artifact@v4
152+
with:
153+
name: coverage_report_ray
154+
path: dj-${{ github.run_id }}
155+
156+
- name: Combine Coverage Reports
157+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
158+
run: |
159+
docker compose exec ray-head bash -c 'coverage combine'
160+
docker compose exec ray-head bash -c 'coverage report -m'
161+
162+
- name: Upload Overall Coverage Report
163+
uses: actions/upload-artifact@v4
164+
with:
165+
name: coverage_report_all
166+
include-hidden-files: true
108167
path: dj-${{ github.run_id }}/.coverage
109168

169+
- name: Get Total Coverage Result
170+
id: total
171+
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
172+
run: |
173+
echo "total=$(docker compose exec ray-head coverage report --format=total)" >> $GITHUB_OUTPUT
174+
110175
- name: Remove docker compose
111176
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
112177
if: always()
@@ -120,7 +185,7 @@ jobs:
120185
121186
dynamic-badge:
122187
runs-on: ubuntu-latest
123-
needs: unittest-single
188+
needs: combine-coverage
124189
steps:
125190
- name: "Create badge"
126191
# https://gist.github.com/HYLcool/f856b14416f08f73d05d32fd992a9c29
@@ -130,7 +195,7 @@ jobs:
130195
gistID: f856b14416f08f73d05d32fd992a9c29
131196
filename: total_cov.json
132197
label: Coverage
133-
message: ${{ needs.unittest-single.outputs.total }}%
198+
message: ${{ needs.combine-coverage.outputs.total }}%
134199
minColorRange: 60
135200
maxColorRange: 90
136201
valColorRange: ${{ needs.combine-coverage.outputs.total }}  # must read from combine-coverage: it is this job's only `needs` entry and the job that declares the `total` output

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ dist
1515
wandb/
1616
__pycache__
1717
.vscode/
18+
.ipynb_checkpoints/
1819

1920
# label studio related
2021
label_studio_data/
@@ -30,5 +31,3 @@ tests/ops/data/*dup*
3031
tests/tools/tmp_*/
3132
tests/ops/deduplicator/chinese_dedup/
3233
tests/ops/deduplicator/english_dedup/
33-
34-
.ipynb_checkpoints/

.pre-commit-config.yaml

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,12 @@
11
repos:
2-
- repo: https://github.com/PyCQA/flake8
3-
rev: 6.1.0
4-
hooks:
5-
- id: flake8
6-
additional_dependencies:
7-
- flake8-pyproject
8-
- pycodestyle<2.12.0
92
- repo: https://github.com/PyCQA/isort.git
103
rev: 5.12.0
114
hooks:
125
- id: isort
13-
- repo: https://github.com/pre-commit/mirrors-yapf
14-
rev: v0.32.0
6+
- repo: https://github.com/psf/black
7+
rev: 25.1.0
158
hooks:
16-
- id: yapf
17-
exclude: data_juicer/ops/common/special_characters.py
18-
additional_dependencies:
19-
- toml
9+
- id: black
2010
- repo: https://github.com/pre-commit/pre-commit-hooks
2111
rev: v5.0.0
2212
hooks:
@@ -28,8 +18,6 @@ repos:
2818
exclude: thirdparty/
2919
- id: requirements-txt-fixer
3020
exclude: thirdparty/
31-
- id: double-quote-string-fixer
32-
exclude: ^(thirdparty/|data_juicer/ops/common/special_characters.py)
3321
- id: check-merge-conflict
3422
exclude: thirdparty/
3523
- id: fix-encoding-pragma
@@ -38,6 +26,13 @@ repos:
3826
- id: mixed-line-ending
3927
exclude: thirdparty/
4028
args: [ "--fix=lf" ]
29+
- repo: https://github.com/PyCQA/flake8
30+
rev: 6.1.0
31+
hooks:
32+
- id: flake8
33+
additional_dependencies:
34+
- flake8-pyproject
35+
- flake8-black
4136
- repo: local
4237
hooks:
4338
- id: build-op-doc

0 commit comments

Comments
 (0)