Skip to content

Commit

Permalink
* fix possible concurrent insertion of duplicate `ImageInReply` entities by lo…
Browse files Browse the repository at this point in the history
…cking the global placeholder object of each image URL filename

* fix `ReplyContentImage` entities related to an already locked image URL filename not being inserted
* fix locked `SaverLocks` never being released before returning
@ `Save()`

+ static field `LocksKeyByUrlFilename`
@ ReplyContentImageSaver.cs
@ c#/crawler
  • Loading branch information
n0099 committed May 17, 2024
1 parent 7b8a94b commit c129bf2
Showing 1 changed file with 24 additions and 3 deletions.
27 changes: 24 additions & 3 deletions c#/crawler/src/Tieba/Crawl/Saver/ReplyContentImageSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ namespace tbm.Crawler.Tieba.Crawl.Saver;

/// <summary>
/// Adds <c>ReplyContentImage</c> entities for the images found in reply posts to a
/// <c>CrawlerDbContext</c>, coordinating on image URL filenames through the injected
/// <c>locks</c> and the static <c>LocksKeyByUrlFilename</c> dictionary.
/// </summary>
/// <remarks>
/// NOTE(review): this listing is a rendered diff with collapsed hunks. The lines that start
/// with "Expand All" are page residue rather than code, and added/removed lines carry no
/// +/- markers, so some statements below may belong to the pre-change version only.
/// </remarks>
public class ReplyContentImageSaver(SaverLocks<string> locks)
{
// Static, hence shared by every instance of this class: holds one placeholder object per
// image URL filename, added near the start of Save() and removed again near its end.
private static readonly ConcurrentDictionary<string, object> LocksKeyByUrlFilename = new();

/// <summary>
/// Finds which images are already present in <c>db.ImageInReplies</c>, tries to lock the
/// URL filenames of the remaining ones, back-fills <c>ExpectedByteSize</c> on existing rows
/// where it is 0, and adds <c>ReplyContentImage</c> entities to <c>db.ReplyContentImages</c>.
/// No call that persists the context is visible in this method.
/// </summary>
/// <param name="db">Context that is queried for existing images and receives the new entities.</param>
/// <param name="replies">Reply posts to process; presumably the source of <c>pidAndImageList</c>
/// (that query is collapsed in this view - TODO confirm).</param>
/// <exception cref="InvalidOperationException">
/// A newly locked URL filename could not be added to, or later removed from,
/// <c>LocksKeyByUrlFilename</c>.
/// </exception>
public void Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
{
// (Pid, Image) pairs, de-duplicated by (Pid, UrlFilename); the head of this query is hidden
// behind the collapsed hunk below.
var pidAndImageList = (
Expand All @@ -18,23 +20,38 @@ public void Save(CrawlerDbContext db, IEnumerable<ReplyPost> replies)
.DistinctBy(t => (t.Pid, t.Image.UrlFilename))
.ToList();
// Nothing to do: returns before the AcquireLocks call below is reached.
if (pidAndImageList.Count == 0) return;

// One image per distinct URL filename, indexed by that filename.
var imagesKeyByUrlFilename = pidAndImageList.Select(t => t.Image)
.DistinctBy(image => image.UrlFilename).ToDictionary(image => image.UrlFilename);

// Images whose URL filename is already in db.ImageInReplies, queried with AsTracking()
// (presumably so the ExpectedByteSize assignment further down is change-tracked).
var existingImages = (
from e in db.ImageInReplies.AsTracking()
where imagesKeyByUrlFilename.Keys.Contains(e.UrlFilename)
select e)
.ToDictionary(e => e.UrlFilename);
// URL filenames that were not found by the query above.
var newImages = imagesKeyByUrlFilename.ExceptByKey(existingImages.Keys).Keys().ToList();
// Presumably AcquireLocks returns only the keys this call managed to lock (SaverLocks is
// not shown - TODO confirm); whatever is left over is treated as locked by someone else.
var newlyLocked = locks.AcquireLocks(newImages);
var alreadyLocked = newImages.Except(newlyLocked).ToList();

// Register a placeholder object for every filename locked by this call; finding an entry
// already present is treated as an invariant violation.
if (newlyLocked.Any(urlFilename => !LocksKeyByUrlFilename.TryAdd(urlFilename, new())))
throw new InvalidOperationException();
// Enter and immediately leave the monitor of each placeholder registered elsewhere.
// NOTE(review): no visible code holds these monitors while Save() runs, so this may not
// wait for the other caller at all - confirm. The dictionary indexer also throws
// KeyNotFoundException when the entry is missing (never added yet, or already removed).
alreadyLocked.ForEach(urlFilename =>
{
lock (LocksKeyByUrlFilename[urlFilename]) { }
});
// Query again for the filenames locked elsewhere and merge any rows found into existingImages.
existingImages = existingImages
.Concat((
from e in db.ImageInReplies.AsTracking()
where alreadyLocked.Contains(e.UrlFilename)
select e).ToDictionary(e => e.UrlFilename))
.ToDictionary();

// For existing images whose ExpectedByteSize is 0, copy the value carried by the image
// with the same URL filename from the current batch.
(from existing in existingImages.Values
where existing.ExpectedByteSize == 0 // randomly respond with 0
join newInContent in imagesKeyByUrlFilename.Values
on existing.UrlFilename equals newInContent.UrlFilename
select (existing, newInContent))
.ForEach(t => t.existing.ExpectedByteSize = t.newInContent.ExpectedByteSize);
// NOTE(review): a second AcquireLocks call appears in the ExceptBy below, fed by this list.
// This view has no +/- markers, so these may be lines the commit removed - confirm against
// the actual file before relying on them.
var newImagesUrlFilename = imagesKeyByUrlFilename.ExceptByKey(existingImages.Keys).Keys().ToList();
db.ReplyContentImages.AddRange(pidAndImageList
.ExceptBy(locks.AcquireLocks(newImagesUrlFilename), t => t.Image.UrlFilename)
.Select(t => new ReplyContentImage
{
Pid = t.Pid,
Expand All @@ -49,5 +66,9 @@ where imagesKeyByUrlFilename.Keys.Contains(e.UrlFilename)
? e
: imagesKeyByUrlFilename[t.Image.UrlFilename]
}));

// Remove the placeholders registered above; a missing entry is treated as an invariant violation.
// NOTE(review): this cleanup and the Dispose() call are not inside a finally block, so
// neither runs if any statement above throws.
if (newlyLocked.Any(urlFilename => !LocksKeyByUrlFilename.TryRemove(urlFilename, out _)))
throw new InvalidOperationException();
// Presumably releases the keys held by `locks` (per the commit message) - TODO confirm.
locks.Dispose();
}
}

0 comments on commit c129bf2

Please sign in to comment.