Skip to content
This repository was archived by the owner on Nov 5, 2025. It is now read-only.

Commit 40a3680

Browse files
authored
Add command to reset contract documents (#177)
* Add command to reset contract documents
* Handle missing data from contractors
* Keep track of last processed contract
* Skip doc processing
* Add aux redis cache
* Cache forever
* Handle null exempt_id

Co-authored-by: José Padilla <[email protected]>
1 parent b624775 commit 40a3680

File tree

8 files changed

+160
-12
lines changed

8 files changed

+160
-12
lines changed

contratospr/contracts/admin.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,23 @@ class ContractorAdmin(admin.ModelAdmin):
6868
search_fields = ["name", "source_id"]
6969

7070

71+
# Stacked inline that embeds Contract rows in another model's admin page
# (it is listed in DocumentAdmin.inlines further down).
class ContractInline(admin.StackedInline):
72+
model = Contract
73+
# Keep the search_vector field off the inline form.
exclude = ["search_vector"]
74+
# Render these relations as raw-ID inputs instead of select widgets.
raw_id_fields = ["entity", "service", "document", "contractors", "parent"]
75+
# Do not offer a delete checkbox for inline contracts.
can_delete = False
76+
# Show no extra blank forms.
extra = 0
77+
# Upper bound Django applies to the number of inline forms it offers.
max_num = 1
78+
79+
7180
@admin.register(Document)
7281
class DocumentAdmin(admin.ModelAdmin):
7382
list_display = ["source_id", "file", "has_text", "created_at", "modified_at"]
7483
exclude = ["pages"]
7584
search_fields = ["source_id"]
7685
actions = ["download_source", "detect_text"]
7786
list_filter = [DocumentFileListFilter]
87+
inlines = [ContractInline]
7888

7989
def has_text(self, obj):
8090
return bool(obj.pages)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from django.core.management.base import BaseCommand
2+
3+
from ...models import Document
4+
from ...search import index_contract
5+
6+
7+
class Command(BaseCommand):
    """Reset stored documents and reindex the contracts that point at them.

    For every ``Document`` whose ``file`` value is not the empty string the
    command deletes the file through the file field, sets ``pages`` to
    ``None`` and calls ``index_contract`` for each related contract.
    """

    help = "Delete all documents in S3, clears extracted pages, and reindex contracts."

    def handle(self, *args, **options):
        """Reset each selected document in turn, reporting progress on stdout."""
        # ``pages`` is deferred so its content is not fetched only to be cleared.
        pending = Document.objects.exclude(file="").defer("pages")

        for doc in pending:
            self._say(f"=> Resetting document {doc.pk}")
            self._drop_file(doc)
            self._clear_pages(doc)
            self._reindex_contracts(doc)

    def _say(self, text):
        """Write one progress line to the command's stdout."""
        self.stdout.write(text)

    def _drop_file(self, doc):
        """Delete the document's stored file via the file field."""
        self._say("==> Deleting file")
        doc.file.delete()

    def _clear_pages(self, doc):
        """Null out ``pages`` and persist only that column."""
        self._say("==> Clearing pages")
        doc.pages = None
        doc.save(update_fields=["pages"])

    def _reindex_contracts(self, doc):
        """Call ``index_contract`` for every contract linked to the document."""
        self._say("==> Indexing contracts")
        for linked in doc.contract_set.all():
            self._say(f"===> Indexing contract {linked.pk}")
            index_contract(linked)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from django.core.cache import caches
2+
from django.core.management.base import BaseCommand
3+
4+
from ...models import Contract
5+
from ...tasks import scrape_contracts
6+
7+
8+
class Command(BaseCommand):
    """Re-scrape locally stored contracts one at a time, resuming between runs.

    Contracts are visited in ascending primary-key order. After each contract
    is scraped its pk is stored in the cache with no expiry, under a key that
    includes the requested limit, so a later run with the same limit continues
    after the last processed contract.
    """

    help = "Update local contracts with data from remote source"

    # Contracts handled per run when --limit is missing or given without a value.
    DEFAULT_LIMIT = 1000

    def add_arguments(self, parser):
        """Register ``--limit``: the maximum number of contracts to process."""
        parser.add_argument(
            "--limit",
            nargs="?",
            type=int,
            default=self.DEFAULT_LIMIT,
            # Without ``const`` a bare ``--limit`` yields None, which used to
            # crash the loop comparison in handle().
            const=self.DEFAULT_LIMIT,
        )

    def handle(self, *args, **options):
        """Scrape up to ``limit`` contracts, starting after the cached pk."""
        cache = self._get_cache()

        limit = options.get("limit")
        if limit is None:
            limit = self.DEFAULT_LIMIT

        contracts = (
            Contract.objects.select_related("entity")
            .all()
            .order_by("pk")
            .only("pk", "number", "entity")
        )

        cache_key = "cmd:update_contracts:last_preview_id:limit={}".format(limit)
        last_contract_id = cache.get(cache_key)

        if last_contract_id is not None:
            self.stdout.write("=> Starting after {}".format(last_contract_id))

        processed = 0
        while processed < limit:
            # Keyset paging: fetch the next row after the last processed pk.
            # A growing OFFSET would rescan all earlier rows on every query
            # and skip a contract if an earlier row were deleted mid-run.
            remaining = contracts
            if last_contract_id is not None:
                remaining = contracts.filter(pk__gt=last_contract_id)

            contract = remaining.first()
            if contract is None:
                self.stdout.write(
                    "=> Nothing found offset={} / limit={} / last_contract_id={}".format(
                        processed, processed + 1, last_contract_id
                    )
                )
                break

            last_contract_id = contract.pk

            self.stdout.write(
                f"=> Scraping contract {contract.pk} / {contract.number}"
            )
            scrape_contracts(
                skip_doc_tasks=True,
                contract_number=contract.number,
                entity_id=contract.entity.source_id,
            )

            self.stdout.write("=> Last contract {}".format(last_contract_id))
            # timeout=None keeps the resume marker until it is overwritten.
            cache.set(cache_key, last_contract_id, timeout=None)

            processed += 1

    @staticmethod
    def _get_cache():
        """Return the "aux" cache if that alias is configured, else "default"."""
        from django.core.cache import InvalidCacheBackendError

        try:
            return caches["aux"]
        except InvalidCacheBackendError:
            # The "aux" alias is optional; other errors should surface.
            return caches["default"]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Generated by Django 3.1.12 on 2021-09-12 23:02
2+
3+
from django.db import migrations, models
4+
5+
6+
# Schema migration for the "contracts" app: redefines Document.source_id as a
# unique CharField with max_length=255.
class Migration(migrations.Migration):
7+
8+
# Must run after the previous migration of the same app.
dependencies = [("contracts", "0006_auto_20201212_2129")]
9+
10+
operations = [
11+
migrations.AlterField(
12+
model_name="document",
13+
name="source_id",
14+
field=models.CharField(max_length=255, unique=True),
15+
)
16+
]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Generated by Django 3.1.12 on 2021-12-18 16:11
2+
3+
from django.db import migrations, models
4+
5+
6+
# Schema migration for the "contracts" app: redefines Contract.exempt_id as a
# CharField (max_length=255) that may be left blank.
class Migration(migrations.Migration):
7+
8+
# Must run after the Document.source_id migration.
dependencies = [("contracts", "0007_auto_20210912_2302")]
9+
10+
operations = [
11+
migrations.AlterField(
12+
model_name="contract",
13+
name="exempt_id",
14+
field=models.CharField(blank=True, max_length=255),
15+
)
16+
]

contratospr/contracts/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def __str__(self):
6868

6969

7070
class Document(BaseModel):
71-
source_id = models.PositiveIntegerField(unique=True)
71+
source_id = models.CharField(max_length=255, unique=True)
7272
source_url = models.URLField()
7373
file = models.FileField(
7474
blank=True, null=True, upload_to=document_file_path, storage=document_storage
@@ -126,7 +126,7 @@ class Contract(BaseModel):
126126
amount_to_pay = models.DecimalField(max_digits=20, decimal_places=2)
127127
has_amendments = models.BooleanField()
128128
document = models.ForeignKey("Document", null=True, on_delete=models.SET_NULL)
129-
exempt_id = models.CharField(max_length=255)
129+
exempt_id = models.CharField(max_length=255, blank=True)
130130
contractors = models.ManyToManyField("Contractor")
131131
parent = models.ForeignKey(
132132
"self", null=True, on_delete=models.CASCADE, related_name="amendments"

contratospr/contracts/tasks.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def normalize_contract(contract):
6565
document_id = result["document_id"]
6666
result[
6767
"document_url"
68-
] = f"{BASE_CONTRACT_URL}/downloaddocument?documentid={document_id}"
68+
] = f"{BASE_CONTRACT_URL}/downloaddocument?code={document_id}"
6969

7070
return result
7171

@@ -78,7 +78,9 @@ def normalize_contractors(contractors):
7878
{
7979
"contractor_id": contractor["ContractorId"],
8080
"entity_id": contractor["EntityId"],
81-
"name": contractor["Name"],
81+
"name": contractor["Name"]
82+
or contractor["ConfirmedName1"]
83+
or contractor["ConfirmedName2"],
8284
}
8385
)
8486

@@ -138,7 +140,7 @@ def request_contract_document(contract_id):
138140

139141

140142
@app.task
141-
def update_contract(result, parent_id=None):
143+
def update_contract(result, parent_id=None, skip_doc_tasks=False):
142144
logger.info(
143145
"Updating contract", contract=result["contract_number"], parent_id=parent_id
144146
)
@@ -171,7 +173,7 @@ def update_contract(result, parent_id=None):
171173
"cancellation_date": result["cancellation_date"],
172174
"amount_to_pay": result["amount_to_pay"],
173175
"has_amendments": result["has_amendments"],
174-
"exempt_id": result["exempt_id"],
176+
"exempt_id": result["exempt_id"] or "",
175177
"parent_id": parent_id,
176178
}
177179

@@ -181,7 +183,7 @@ def update_contract(result, parent_id=None):
181183
defaults={"source_url": result["document_url"]},
182184
)
183185

184-
if document_created:
186+
if document_created and skip_doc_tasks:
185187
chain(download_document.si(document.pk), detect_text.si(document.pk))()
186188

187189
artifacts.append({"obj": document, "created": document_created})
@@ -195,7 +197,7 @@ def update_contract(result, parent_id=None):
195197

196198
for contractor_result in result["contractors"]:
197199
contractor, contractor_created = Contractor.objects.get_or_create(
198-
source_id=contractor_result["contractor_id"],
200+
source_id=contractor_result["contractor_id"] or contract.source_id,
199201
defaults={
200202
"name": contractor_result["name"],
201203
"entity_id": contractor_result["entity_id"],
@@ -207,10 +209,13 @@ def update_contract(result, parent_id=None):
207209
contract.contractors.add(contractor)
208210

209211
for amendment_result in result["amendments"]:
210-
amendment_artifacts = update_contract(amendment_result, parent_id=contract.pk)
212+
amendment_artifacts = update_contract(
213+
amendment_result, parent_id=contract.pk, skip_doc_tasks=skip_doc_tasks
214+
)
211215
artifacts.extend(amendment_artifacts)
212216

213-
index_contract(contract)
217+
if not skip_doc_tasks:
218+
index_contract(contract)
214219

215220
return artifacts
216221

@@ -221,6 +226,7 @@ def scrape_contracts(limit=None, max_items=None, **kwargs):
221226
total_records = 0
222227
default_limit = 10
223228
real_limit = limit or default_limit
229+
skip_doc_tasks = kwargs.pop("skip_doc_tasks", False)
224230
collection_job_id = kwargs.pop("collection_job_id", None)
225231
collection_job = None
226232
if collection_job_id:
@@ -242,7 +248,7 @@ def scrape_contracts(limit=None, max_items=None, **kwargs):
242248

243249
for contract in contracts["data"]:
244250
expanded = expand_contract(contract)
245-
results = update_contract(expanded)
251+
results = update_contract(expanded, skip_doc_tasks=skip_doc_tasks)
246252

247253
if collection_job:
248254
collection_job.create_artifacts(results)

contratospr/settings.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ class Common(Configuration):
151151
AUTH_USER_MODEL = "users.User"
152152

153153
REDIS_URL = values.Value(environ_prefix=None)
154+
AUX_REDIS_URL = values.Value(environ_prefix=None)
154155

155156
DEBUG_TOOLBAR_CONFIG = {
156157
"SHOW_TOOLBAR_CALLBACK": "contratospr.utils.debug_toolbar.show_toolbar"
@@ -229,14 +230,23 @@ class Production(Common):
229230

230231
@property
def CACHES(self):
    """Return the cache settings for this configuration.

    The "default" alias is always present and points at ``REDIS_URL``. An
    "aux" alias pointing at ``AUX_REDIS_URL`` is added only when that value
    is truthy. Both aliases use database ``/1`` of their Redis URL.
    """

    def redis_backend(base_url):
        # One definition for both aliases so they cannot drift apart.
        return {
            "BACKEND": "django_redis.cache.RedisCache",
            "LOCATION": f"{base_url}/1",
            "OPTIONS": {"CLIENT_CLASS": "django_redis.client.DefaultClient"},
        }

    caches = {"default": redis_backend(self.REDIS_URL)}

    if self.AUX_REDIS_URL:
        caches["aux"] = redis_backend(self.AUX_REDIS_URL)

    return caches
249+
240250

241251
class Testing(Common):
242252
PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"]

0 commit comments

Comments
 (0)