2 Commits

Author SHA1 Message Date
7e9c87de44 feat: convert png and jpeg to webp to reduce usage. 2025-03-24 22:39:10 +08:00
fda4c01c22 feat: compress command. 2025-03-24 22:38:18 +08:00
268 changed files with 124 additions and 2869 deletions

View File

@@ -15,9 +15,6 @@ trim_trailing_whitespace = true
[project.json]
indent_size = 2
[*.{yaml,yml}]
indent_size = 2
# C# and Visual Basic files
[*.{cs,vb}]
charset = utf-8-bom

3
.gitattributes vendored
View File

@@ -1,5 +1,2 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.avif filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text

View File

@@ -7,30 +7,27 @@ jobs:
Build-Blog-Image:
runs-on: archlinux
steps:
- name: Check out code.
uses: https://mirrors.rrricardo.top/actions/checkout.git@v4
- uses: https://mirrors.rrricardo.top/actions/checkout.git@v4
name: Check out code
with:
lfs: true
- name: Build project.
- name: Build project
run: |
cd YaeBlog
dotnet publish
- name: Build docker image.
- name: Build docker image
run: |
proxy
podman pull mcr.microsoft.com/dotnet/aspnet:9.0
unproxy
cd YaeBlog
podman build . -t ccr.ccs.tencentyun.com/jackfiled/blog --build-arg COMMIT_ID=$(git rev-parse --short=10 HEAD)
- name: Workaround to make sure podman-login working.
podman build . -t registry.cn-beijing.aliyuncs.com/jackfiled/blog:latest
- name: Workaround to make sure podman login succeed
run: |
mkdir /root/.docker
- name: Login tencent cloud docker registry.
- name: Login aliyun docker registry
uses: https://mirrors.rrricardo.top/actions/podman-login.git@v1
with:
registry: ccr.ccs.tencentyun.com
username: 100044380877
password: ${{ secrets.TENCENT_REGISTRY_PASSWORD }}
registry: registry.cn-beijing.aliyuncs.com
username: 初冬的朝阳
password: ${{ secrets.ALIYUN_PASSWORD }}
auth_file_path: /etc/containers/auth.json
- name: Push docker image.
run: podman push ccr.ccs.tencentyun.com/jackfiled/blog:latest
- name: Push docker image
run: podman push registry.cn-beijing.aliyuncs.com/jackfiled/blog:latest

View File

@@ -1,29 +0,0 @@
<div class="flex flex-wrap justify-center gap-12 max-w-md md:max-w-lg">
<div class="relative w-40 h-48 md:w-48 md:w-48 overflow-hidden
transition-all duration-300 ease-out hover:scale-125 group">
<img
src="./images/wechat-code.jpeg"
alt="微信赞赏码"
class="w-full h-full object-cover"
/>
<div class="absolute -bottom-8 left-0 right-0 text-center
text-white bg-black opacity-60 text-sm font-medium
backdrop-blur-sm group-hover:bottom-2 transition-all duration-300">
请我喝奶茶<br/>
</div>
</div>
<div class="relative w-40 h-48 md:w-48 md:h-48 overflow-hidden
transition-all duration-300 ease-out hover:scale-125 group">
<img
src="./images/alipay-code.jpeg"
alt="支付宝赞赏码"
class="w-full h-full object-cover"/>
<div class="absolute -bottom-8 left-0 right-0 text-center
text-white bg-black opacity-60 text-sm font-medium
backdrop-blur-sm group-hover:bottom-2 transition-all duration-300">
请我吃晚饭<br/>
</div>
</div>
</div>

View File

@@ -7,15 +7,11 @@
<Anchor Address="https://dotnet.microsoft.com" Text="@DotnetVersion"/>
驱动。
</p>
<p class="text-md">
Build Commit #
<Anchor Address="@BuildCommitUrl" Text="@BuildCommitId"/>
</p>
</div>
<div>
<p class="text-md">
<Anchor Address="https://beian.miit.gov.cn" Text="蜀ICP备2022004429号-1" NewPage="true"/>
<a href="https://beian.miit.gov.cn" target="_blank" class="text-black">蜀ICP备2022004429号-1</a>
</p>
</div>
</div>
@@ -23,8 +19,4 @@
@code
{
private string DotnetVersion => $".NET {Environment.Version}";
private string BuildCommitId => Environment.GetEnvironmentVariable("COMMIT_ID") ?? "local_build";
private string BuildCommitUrl => $"https://git.rrricardo.top/jackfiled/YaeBlog/commit/{BuildCommitId}";
}

View File

@@ -1,3 +1,6 @@
@using YaeBlog.Models
@inject BlogOptions Options
<div class="px-4 py-8 border border-sky-700 rounded-md bg-sky-200">
<div class="flex flex-col gap-3 text-md">
<div>
@@ -21,17 +24,6 @@
Ricardo's Blog
</a>”。
</div>
<div class="flex flex-col">
<div class="flex justify-center">
<p>如果觉得不错的话,可以支持一下作者哦~</p>
</div>
<div class="flex justify-center">
<AppreciationCode/>
</div>
</div>
</div>
</div>

View File

@@ -1,8 +1,5 @@
FROM mcr.microsoft.com/dotnet/aspnet:9.0
ARG COMMIT_ID
ENV COMMIT_ID=${COMMIT_ID}
WORKDIR /app
COPY bin/Release/net9.0/publish/ ./
COPY source/ ./source/

View File

@@ -1,12 +0,0 @@
namespace YaeBlog.Core.Exceptions;
public class BlogCommandException : Exception
{
public BlogCommandException(string message) : base(message)
{
}
public BlogCommandException(string message, Exception innerException) : base(message, innerException)
{
}
}

View File

@@ -6,7 +6,10 @@ namespace YaeBlog.Models;
public record BlogContents(ConcurrentBag<BlogContent> Drafts, ConcurrentBag<BlogContent> Posts)
: IEnumerable<BlogContent>
{
public IEnumerator<BlogContent> GetEnumerator() => Posts.Concat(Drafts).GetEnumerator();
IEnumerator<BlogContent> IEnumerable<BlogContent>.GetEnumerator()
{
return Posts.Concat(Drafts).GetEnumerator();
}
IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
public IEnumerator GetEnumerator() => ((IEnumerable<BlogContent>)this).GetEnumerator();
}

View File

@@ -39,11 +39,6 @@
{
_page = Page ?? 1;
_pageCount = Contents.Count / EssaysPerPage + 1;
(_pageCount, int reminder) = int.DivRem(Contents.Count, EssaysPerPage);
if (reminder > 0)
{
_pageCount += 1;
}
if (EssaysPerPage * _page > Contents.Count + EssaysPerPage)
{

View File

@@ -16,11 +16,11 @@ public sealed class BlogHotReloadService(
await rendererService.RenderAsync(true);
Task[] reloadTasks = [WatchFileAsync(stoppingToken)];
Task[] reloadTasks = [FileWatchTask(stoppingToken)];
await Task.WhenAll(reloadTasks);
}
private async Task WatchFileAsync(CancellationToken token)
private async Task FileWatchTask(CancellationToken token)
{
while (!token.IsCancellationRequested)
{
@@ -33,15 +33,6 @@ public sealed class BlogHotReloadService(
break;
}
FileInfo changeFileInfo = new(changeFile);
if (changeFileInfo.Name.StartsWith('.'))
{
// Ignore dot-started file and directory.
logger.LogDebug("Ignore hidden file: {}.", changeFile);
continue;
}
logger.LogInformation("{} changed, re-rendering.", changeFile);
essayContentService.Clear();
await rendererService.RenderAsync(true);

View File

@@ -109,12 +109,6 @@ public partial class EssayScanService : IEssayScanService
{
foreach (BlogResult blog in fileContents)
{
if (blog.BlogContent.Length < 4)
{
// Even not contains a legal header.
continue;
}
int endPos = blog.BlogContent.IndexOf("---", 4, StringComparison.Ordinal);
if (!blog.BlogContent.StartsWith("---") || endPos is -1 or 0)
{
@@ -127,14 +121,14 @@ public partial class EssayScanService : IEssayScanService
try
{
MarkdownMetadata metadata = _yamlDeserializer.Deserialize<MarkdownMetadata>(metadataString);
_logger.LogDebug("Scan metadata title: '{title}' for {name}.", metadata.Title, blog.BlogFile.Name);
_logger.LogDebug("Scan metadata title: '{}' for {}.", metadata.Title, blog.BlogFile.Name);
contents.Add(new BlogContent(blog.BlogFile, metadata, blog.BlogContent[(endPos + 3)..], isDraft,
blog.Images, blog.NotFoundImages));
}
catch (YamlException e)
{
_logger.LogWarning("Failed to parser metadata from {name} due to {exception}, skipping", blog.BlogFile.Name, e);
_logger.LogWarning("Failed to parser metadata from {} due to {}, skipping", blog.BlogFile.Name, e);
}
}
});

View File

@@ -56,7 +56,7 @@ public sealed class ImageCompressService(IEssayScanService essayScanService, ILo
}
CompressResult[] compressedImages = (await Task.WhenAll(from image in uncompressedImages
select Task.Run(async () => new CompressResult(image, await ConvertToWebp(image))))).ToArray();
select Task.Run(async () => new CompressResult(image, await ConvertToWebp(image.Content))))).ToArray();
compressedSize += compressedImages.Select(i => i.CompressContent.Length).Sum();
@@ -65,8 +65,7 @@ public sealed class ImageCompressService(IEssayScanService essayScanService, ILo
select r.ImageInfo with
{
File = new FileInfo(r.ImageInfo.File.FullName.Split('.')[0] + ".webp"),
Content = r.CompressContent,
MineType = "image/webp"
Content = r.CompressContent
}).ToList();
// 修改文本
string blogContent = compressedImages.Aggregate(content.Content, (c, r) =>
@@ -89,31 +88,21 @@ public sealed class ImageCompressService(IEssayScanService essayScanService, ILo
}
}
private static async Task<byte[]> ConvertToWebp(BlogImageInfo image)
private static async Task<byte[]> ConvertToWebp(byte[] image)
{
using ImageJob job = new();
BuildJobResult result = await job.Decode(MemorySource.Borrow(image.Content))
.Branch(f => f.EncodeToBytes(new WebPLosslessEncoder()))
BuildJobResult result = await job.Decode(MemorySource.Borrow(image))
.EncodeToBytes(new WebPLossyEncoder(75))
.Finish()
.InProcessAsync();
// 超过128KB的图片使用有损压缩
// 反之使用无损压缩
ArraySegment<byte>? array = result.First?.TryGetBytes();
ArraySegment<byte>? losslessImage = result.TryGet(1)?.TryGetBytes();
ArraySegment<byte>? lossyImage = result.TryGet(2)?.TryGetBytes();
if (image.Size <= 128 * 1024 && losslessImage.HasValue)
if (array.HasValue)
{
return losslessImage.Value.ToArray();
return array.Value.ToArray();
}
if (lossyImage.HasValue)
{
return lossyImage.Value.ToArray();
}
throw new BlogCommandException($"Failed to convert {image.File.Name} to webp format: return value is null.");
throw new BlogFileException();
}
}

View File

@@ -1,8 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web">
<ItemGroup>
<PackageReference Include="ImageFlow.NativeRuntime.ubuntu-x86_64" Version="2.1.0-rc11" Condition="$([MSBuild]::IsOsPlatform('Linux'))"/>
<PackageReference Include="ImageFlow.NativeRuntime.osx-arm64" Version="2.1.0-rc11" Condition="$([MSBuild]::IsOsPlatform('OSX'))"/>
<PackageReference Include="ImageFlow.NativeRuntime.ubuntu-x86_64" Version="2.1.0-rc11"/>
<PackageReference Include="ImageFlow.Net" Version="0.13.2"/>
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1"/>
<PackageReference Include="AngleSharp" Version="1.1.0"/>

View File

@@ -1,12 +1,11 @@
---
title: 2021年终总结
date: 2022-01-12T16:27:19.0000000
date: 2022-01-12 16:27:19
tags:
- 杂谈
- 年终总结
- 杂谈
- 年终总结
---
2021年已经过去2022年已经来临。每每一年开始的时候我都会展开一张纸或者新建一个文档思量着又是一年时光也该同诸大杂志一般写几句意味深长的话语怀念过去的时光也祝福未来的自己。可往往脑海中已是三万字的长篇落在笔头却又是一个字都没有了。
如今跨年的时候已经过去朋友圈中已经不见文案的踪影我也该重新提笔细说自己2021年中做过的种种。

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 9.8 KiB

View File

@@ -1,13 +1,12 @@
---
title: 2022年终总结
date: 2022-12-30T14:58:12.0000000
tags:
- 杂谈
- 年终总结
- 杂谈
- 年终总结
date: 2022-12-30 14:58:12
---
2022是困难的一年。我们需要为2023年做好准备。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 54 KiB

View File

@@ -1,11 +1,11 @@
---
title: 2022年暑假碎碎念
date: 2022-08-22T15:39:13.0000000
tags:
- 杂谈
- 杂谈
typora-root-url: 2022-summer-vacation
date: 2022-08-22 15:39:13
---
在8个月的漫长寒假的最后两个月~~也就是俗称的暑假中~~,我都干了些什么?
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 134 KiB

View File

@@ -1,12 +1,11 @@
---
title: 2023年年终总结
date: 2024-02-29T20:18:19.0000000
tags:
- 杂谈
- 年终总结
- 杂谈
- 年终总结
date: 2024-2-29 20:18:19
---
虽然2023年已经过去了两个月但是年终总结还是要发的。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -6,7 +6,6 @@ tags:
- 年终总结
---
欸,年终总结难道不是应该在新年当天发出吗,什么已经是新年第三天了?!
然而年末偶遇流感病毒,头疼脑热强如怪物,拼尽全力也无法战胜。

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 38 KiB

View File

@@ -1,11 +1,11 @@
---
title: 人生代码大作业初体验
date: 2022-07-27T11:34:49.0000000
tags:
- 杂谈
- 杂谈
typora-root-url: big-homework
date: 2022-07-27 11:34:49
---
在大学也呆了一年了,终于遇上了第一个需要多人合作的写代码项目。从四月底分组完成,任务部署下来到七月初接近尾声,在这两个多月的时间里,也算是经历了不少,学到了不少。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -1,12 +1,12 @@
---
title: 建立博客过程的记录
date: 2022-04-08T11:52:32.0000000
typora-root-url: 建立博客过程的记录
date: 2022-04-08 11:52:32
tags:
- 技术笔记
- 技术笔记
---
当我已经在Python的浩瀚大海遨zhengzha了半个暑假后我决定尝试一下传说中程序员专用的学(zhuang)习(bi)手(fangfa)段(fa)——建立自己的个人博客。作为一个半懂不懂的Python程序员心中冒出的第一个想法自然是采用Python的Django作为开发自己的个人博客的手段。然而在阅读了[用Django搭建个人博客](https://www.dusaiphoto.com/article/2/)等的其他人搭建这类动态博客的过程记录之后我便义无反顾的转向了采用javascript开发的博客框架[Hexo](https://hexo.io)<del>说好的Python信仰呢</del>。无他,唯简单尔。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 38 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 78 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 79 KiB

View File

@@ -1,12 +1,12 @@
---
title: C项目中有关头文件的一些问题
date: 2022-05-08T11:35:19.0000000
tags:
- 技术笔记
- C/C++
- 技术笔记
- C/C++
typora-root-url: c-include-problems
date: 2022-05-08 11:35:19
---
最近在完成一门`C`语言课程的大作业,课设老师要求我们将程序分模块的开发。在编写项目头文件的时候,遇到了一些令本菜鸡大开眼界的问题。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 5.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 205 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 238 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 639 KiB

View File

@@ -1,12 +1,11 @@
---
title: 编译MediaPipe框架
date: 2022-11-11T22:20:25.0000000
tags:
- C/C++
- 技术笔记
- C/C++
- 技术笔记
date: 2022-11-11 22:20:25
---
编译MediaPipe框架。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 7.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 8.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 57 KiB

View File

@@ -1,12 +1,12 @@
---
title: 日用Linux挑战 第0篇 初见Arch Linux
date: 2023-01-15T22:23:08.0000000
tags:
- Linux
- 杂谈
- Linux
- 杂谈
date: 2023-01-15 22:23:08
typora-root-url: daily-linux-0
---
在将开发重心移到`WSL`上一年之后我最终还是决定完全抛弃Windows转向使用Linux作为我日常使用的主力系统。目前我已经使用Linux作为主力系统一个月了。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 138 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 73 KiB

View File

@@ -1,13 +1,13 @@
---
title: 日用Linux挑战 第2篇 Wayland
date: 2023-07-23T11:44:34.0000000
tags:
- 杂谈
- Linux
- 杂谈
- Linux
date: 2023-07-23 11:44:34
typora-root-url: daily-linux-2
---
使用`Linux`6个月我成功戒掉了原神。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 76 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 53 KiB

View File

@@ -1,13 +1,13 @@
---
title: 日用Linux挑战 第3篇 放弃Wayland
date: 2023-09-04T14:47:46.0000000
tags:
- 杂谈
- Linux
- 杂谈
- Linux
typora-root-url: daily-linux-3
date: 2023-09-04 14:47:46
---
成也开源,败也开源。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 155 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 197 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 B

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -214,7 +214,7 @@ void Run(string name, Action<int> body)
| -------- | ------------- | ------------- | ------------ |
| 原子指令 | 2,000,000,000 | 2,000,000,000 | 22241.9848ms |
| 朴素 | 2,000,000,000 | 220,525,235 | 277.3435ms |
| 随机 | 2,000,000,000 | 2,024,587,268 | 527.5323ms |
| 随机 | 2,000,000,000 | 2,024,,587,268 | 527.5323ms |
从数据上就可以发现,新方法可以在和朴素方法接近的运行时间下获得和使用原子指令接近的实际数值,而且运行时间会随着数值的增加进一步的减少,逐渐逼近朴素方法的运行时间。
@@ -457,170 +457,4 @@ static int ConditionalSelect(bool condition, int whenTrue, int whenFalse) =>
.NET提供的一种特性就是运行时安全这其中重要的一点就是对于数组、字符串和切片在运行时进行边界检查。但是这些边界检查就会在实际生成的代码中生成大量的分支判断这会导致程序运行的效率严重下降。因此如何让编译器在能够保证访问安全的情况下消除掉部分不必要的边界检查是编译器优化中的一个重要课题。
例如在一个常用数据结构——哈希表中,通常的实现是计算键的哈希值,并利用该哈希值作为下标在数组中获得存储的对象。考虑到哈希值是一个`int`类型的变量但是哈希表中很少需要存储高达21亿对象因此往往需要对哈希值取模之后再作为数组的下标此时取模的值常常就是数组的长度。也就是说在这种情况下对于数组的访问是不可能出现越界的情况下。因此编译器可以为类似与如下的代码取消访问数组时的边界检查
```csharp
public class Tests
{
private readonly int[] _array = new int[7];
public int GetBucket() => GetBucket(_array, 42);
private static int GetBucket(int[] buckets, int hashcode) =>
buckets[(uint)hashcode % buckets.Length];
}
```
同样的,对于下面这些代码,编译器也可以取消访问数组时的边界检查:
```csharp
public class Tests
{
private readonly string _s = "\"Hello, World!\"";
public bool IsQuoted() => IsQuoted(_s);
private static bool IsQuoted(string s) =>
s.Length >= 2 && s[0] == '"' && s[^1] == '"';
}
```
### 常量折叠
常量折叠Constant Folding同样是一个编译器在生成代码时可以进行的重要优化这让编译器在计算在编译器时就可以确定的值而不是让他们留到运行时进行。最朴素的常量折叠——例如计算一个数学表达式的值——在这里不在赘述。在上面介绍函数内联时也涉及到了常量折叠的内容分层编译的引入也会使得常量折叠的应用范围变广这些都不在这里重复。
进行常量折叠优化时一个重要的问题是“教会”编译器哪些变量是常量。这方面编译器得到的提升有:
- 可以将一个字面值字符串的长度视为一个常数;
- 在进行空安全的检查时字面值字符串是必定不为空的;
- 编译器在编译时除了可以进行一些简单的数学运算,现在整个`System.Math`命名空间中提供的算法都可以在编译时进行运算;
- `static readonly`类型的字符串和数组长度被视为一个常数;
- `obj.GetType()`现在在JIT编译器明确了解类型的情况下可以被替换为一个常量
- `DateTime`等时间类型初始化时可以在编译期计算内存存储的时间。例如对于`new DateTime(2023, 9, 1)`将会直接被编译到`new DateTime(0x8DBAA7E629B4000)`。
上述这些并不能完全覆盖在.NET 6到.NET 8三个大版本之中引入的所有JIT编译器优化但是从中也可以一窥编译器优化的精巧之处。首先编译器的优化并不是一个个独立优化策略的组合而且各种优化策略的有机组合。方法的内联就是一个典型例子通过将被调用方法的内容暴露给调用者或者反过来让其他的各种优化策略发挥更大的作用。其次JIT编译器在编译优化方面可以发挥更伟大的作用。通过在程序运行时对于运行环境和程序本身有着更加深刻的理解JIT编译器可以在运行时发挥出更高的性能。
## 内存管理
.NET中的垃圾回收器GC负责管理应用程序的内存分配和释放。每当有对象新建时运行时都会将从托管堆为对象分配内存主要托管堆中还有地址空间运行时就会从托管堆为对象分配内存。不过内存并不是无限的垃圾回收器就负责执行垃圾回收来释放一些内存。垃圾回收器的优化引擎会根据所执行的分配来确定执行收回的最佳时机。
.NET中内存管理中的一个显著变更为将内存的抽象从段Segment修改为区域Region。段和区域之前最明显的区别是大小段是较大的内存——在64位的机器上一个段的大小万网是1GB、2GB或者是4GB而区域是非常小的单元在默认情况下只有4MB的大小。从宏观上来说之前的GC是为每个代的堆维持一个GB级别的内存范围而现在GC则是维持了许多个较小的内存区域这些内存区域可以被分配给各个代的堆或者其他可能涉及的堆使用。
垃圾回收器中还有两个引人注意的特性增加。第一个是动态的代提升和下降Dynamic Promotion and Demotion`DPAD`第二个是动态适应应用程序大小Dynamic Adaptive To Application Size`DATAS`)。`DPAD`特性允许GC在工作的过程中动态的设置一个区域的代数例如直接将一个可能存活时间非常长的对象配置为第2代而这在之前的GC模型中需要通过两次垃圾回收才能实现。而第二个特性`DATAS`旨在适应应用程序的内存要求即应用程序堆的大小和长期数据大小大致成正比即使在不同规格的计算机上执行相同的工作时运行时中堆的大小也是类似的。相比如下传统的服务器模式下的GC旨在提高程序的吞吐量允许内存的分配量基于吞吐量而不是应用程序的大小。`DATAS`对于各种突发类型的工作负载是非常有利的,同时通过允许堆大小按照工作负载的要求进行调整,这将让一些内存首先的环境直接受益。
### 无垃圾回收的堆
在程序中大量会涉及到使用常量字符串的情形,例如下面这个例子:
```csharp
public class Tests
{
public string GetPrefix() => "https://";
}
```
在.NET 7平台上这个方法会被JIT编译器编译之后得到下面这段本机代码
```assembly
; Tests.GetPrefix()
mov rax,126A7C01498
mov rax,[rax]
ret
; Total bytes of code 14
```
在这段代码中使用了两个`mov`指令,其中第一个指令加载存储这个字符串对象地址的地址,第二个读取该地址。从这段本机代码可以看见,尽管已经是在处理一个常量的字符串,但是编译器和运行时仍然需要为这个字符串在堆上分配一个`string`对象因为一个在堆上分配的对象在GC的控制下会在内存中发生移动编译器就不能为这个对象使用一个固定的内存地址需用从一个指定的地址读取该对象所在的地址。如果能让这个常量字符串分配在不会移动的内存区域中就能从编译器和GC两个方面上提高程序运行的效率。
为了优化这种生成周期和程序一致对象的内存管理,.NET 8中引入了一个新的堆——没有内存管理的堆。JIT编译器将会保证这些常量类型的对象将会被分配在这个堆中这种没有GC管理的堆也意味着JIT编译器可以为这些对象使用一个固定的内存地址在使用时避免掉了一次内存读取。
![Heaps where .NET Objects Live](./dotnet-performance-8/HeapsWhereNetObjectsLive.webp)
将上述提高的示例代码使用.NET 8版本进行编译得到的代码如下从中也可以看出JIT编译器生成的代码只有一条`mov`指令,避免了一次内存访问。
```assembly
; Tests.GetPrefix()
mov rax,227814EAEA8
ret
; Total bytes of code 11
```
这个没有内存管理的堆引入还可以让其他的类型受益。例如对于`typeof(T)`返回的类型对象,容易想到一个程序集中所有类型对象的生命周期应该是和程序一致的,因此也可以在这个堆上分配所有这些类型对象。`Array.Empty<T>`也可以利用类似的思路分配在这个堆上。
### 值类型
因为可以避免在堆上分配内存,值类型已经在.NET的高性能代码中得到了广泛的应用虽然频繁的内存拷贝可能带来额外的性能开销。因此编译器对于值类型的各种优化就显得至关重要。
这部分优化中一个引人注目的点是值类型的“推广”promotion这里的推广意味着将一个结果划分为组成它的各种字段来区别对待。可以利用下面这个示例代码进行理解
```csharp
public class Tests
{
private ParsedStat _stat;
[Benchmark]
public ulong GetTime()
{
ParsedStat stat = _stat;
return stat.utime + stat.stime;
}
internal struct ParsedStat
{
internal int pid;
internal string comm;
internal char state;
internal int ppid;
internal int session;
internal ulong utime;
internal ulong stime;
internal long nice;
internal ulong starttime;
internal ulong vsize;
internal long rss;
internal ulong rsslim;
}
}
```
在这段代码中有一个较大的结构类型其的大小是80个字节。在没有启用推广的条件下进行运行`GetTime`方法编译得到的本机代码如下所示。在汇编代码中将下载栈上分配一片88字节的空间再将整个结构体直接复制到当前方法的栈上在复制完成之后计算两个字段的和并返回。
```assembly
; Tests.GetTime()
push rdi
push rsi
sub rsp,58
lea rsi,[rcx+8]
lea rdi,[rsp+8]
mov ecx,0A
rep movsq
mov rax,[rsp+10]
add rax,[rsp+18]
add rsp,58
pop rsi
pop rdi
ret
; Total bytes of code 40
```
而在打开推广的情况下运行得到的本机代码如下所示:
```assembly
; Tests.GetTime()
add rcx,8
mov rax,[rcx+8]
mov rcx,[rcx+10]
add rax,rcx
ret
; Total bytes of code 16
```
在这段汇编代码中JIT编译器只复制了两个需要使用的字段到当前方法的栈上这就大幅减少了值类型在方法调用之前产生内存复制开销。
## 还有更多……
行文至此,本篇已经字数超过一万字了,毫无疑问这将成为博客历史上最长的一篇文章。在这点字数中我们还只是**简略**的介绍了一下.NET平台过去的几个版本中涉及到的优化还主要聚焦于JIT编译器和内存管理的部分在这两个部分之后还有一个线程管理部分也是影响性能的关键组件同时.NET还提供了一个由数千个API组成的运行库这些类型中无论是基元类型还是泛型集合类型都获得了若干提升这些部分共同组成了这几个版本的性能奇迹。
本篇文章中的主要内容来自于.NET运行时仓库中的[Book of the Runtime](https://github.com/dotnet/runtime/blob/main/docs/design/coreclr/botr/README.md)和微软开发者博客上的[Performance Improvements in .NET 6](https://devblogs.microsoft.com/dotnet/performance-improvements-in-net-6/)、[Performance Improvements in .NET 7](https://devblogs.microsoft.com/dotnet/performance_improvements_in_net_7/)和[Performance Improvements in .NET 8](https://devblogs.microsoft.com/dotnet/performance-improvements-in-net-8/#whats-next)等几篇文章,上述没有覆盖到的内容推荐读者这些文章。同时算算时间,.NET 9版本引入的性能提升文章应该也要发布了。
回到文章最开始时的问题JIT编译就一定比AOT编译慢吗从启动速度上来说JIT编译当然是完败AOT编译但是在程序长时间运行各项设备JIT编译器、运行时和GC等预热完成之后则是鹿死谁手犹未可知了。
例如在一个常用数据结构——哈希表中,通常的实现是计算键的哈希值,并利用该哈希值作为下标在数组中获得存储的对象。考虑到哈希值是一个

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -1,11 +1,11 @@
---
title: 环境配置备忘录
date: 2022-01-15T20:19:39.0000000
date: 2022-01-15 20:19:39
tags:
- 技术笔记
- 技术笔记
typora-root-url: 环境配置
---
电脑上的环境三天两头出问题,写下一个备忘录记录一下电脑上环境的配置过程。
<!--more-->

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 8.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 7.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 8.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 129 B

After

Width:  |  Height:  |  Size: 5.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 8.6 KiB

View File

@@ -32,7 +32,7 @@ tags:
![eab553f9e30d8d866a1ddd201b5e4c85.webp](./heterogeneous-programming-model/eab553f9e30d8d866a1ddd201b5e4c85.webp)
最后是异构系统中多层次数据共享和多范围同步操作带来的同步困难问题。这也可以认为是上个数据同步问题带来的后继问题在异构系统中数据可能分布在不同位置的条件下同步操作需要在众多的位置上保证共享数据的一致性这使得同步操作的范围变得十分复杂。同时在一些特定的加速设备中例如GPU可能还会有局部的硬件同步机制这更加提高了在异构系统的同步操作的设计和实现难度。
最后是异构系统中多层次数据共享和多范围同步操作带来的同步困难问题。这也可以认为是上个数据同步问题带来的后继问题:在异构系统中数据可能分布在不同位置的条件下,同步操作需要在众多的位置上保证共享数据的一致性,这使得同步操作的范围变得十分复杂。同时,在一些特定的加速设备中,<EFBFBD>例如GPU可能还会有局部的硬件同步机制这更加提高了在异构系统的同步操作的设计和实现难度。
上层应用带来的挑战主要集中在缺少良好的异构抽象和统一的编程接口上。例如在CPU上进行编程时通常使用Java、Python等高级语言而在进行GPU编程时则使用各种C语言的变体其中的核心计算函数Kernel Function则通常只支持一个C语言的子集而FPGA这些硬件设备又需要使用硬件描述语言进行编程。
@@ -725,7 +725,7 @@ static std::vector<int> syclCalculateMatrix(const std::vector<int>& a, const std
const sycl::accessor bBufferAccessor(bBuffer, h, sycl::read_only);
const sycl::accessor resultBufferAccessor(resultBuffer, h, sycl::write_only);
h.parallel_for(sycl::nd_range<2>({MATRIX_SIZE, MATRIX_SIZE}, {16, 16}), [=](const sycl::nd_item<2>& item)
h.parallel_fl_for(sycl::nd_range<2>({MATRIX_SIZE, MATRIX_SIZE}, {16, 16}), [=](const sycl::nd_item<2>& item)
{
const size_t x = item.get_global_id(0);
const size_t y = item.get_global_id(1);

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 8.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 45 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 B

After

Width:  |  Height:  |  Size: 15 KiB

View File

@@ -1,370 +0,0 @@
---
title: High Performance Computing 25 SP NVIDIA
date: 2025-08-31T13:50:42.8639950+08:00
tags:
- 高性能计算
- 学习资料
---
Fxxk you, NVIDIA!
<!--more-->
CPU/GPU Parallelism:
Moore's Law gives you more and more transistors:
- CPU strategy: make the workload (one compute thread) run as fast as possible.
- GPU strategy: make the workload (as many threads as possible) run as fast as possible.
GPU Architecture:
- Massively Parallel
- Power Efficient
- Memory Bandwidth
- Commercially Viable Parallelism
- Not dependent on large caches for performance
![image-20250424192311202](./hpc-2025-cuda/image-20250424192311202.webp)
## Nvidia GPU Generations
- 2006: G80-based GeForce 8800
- 2008: GT200-based GeForce GTX 280
- 2010: Fermi
- 2012: Kepler
- 2014: Maxwell
- 2016: Pascal
- 2017: Volta
- 2021: Ampere
- 2022: Hopper
- 2024: Blackwell
#### 2006: G80 Terminology
SP: Streaming Processor, scalar ALU for a single CUDA thread
SPA: Stream Processor Array
SM: Streaming Multiprocessor, containing of 8 SP
TPC: Texture Processor Cluster: 2 SM + TEX
![image-20250424192825010](./hpc-2025-cuda/image-20250424192825010.webp)
Design goal: performance per millimeter
For GPUs, performance is throughput, so hide latency with computation not cache.
So this is single instruction multiple thread (SIMT).
**Thread Life Cycle**:
Grid is launched on the SPA and thread blocks are serially distributed to all the SM.
![image-20250424193125125](./hpc-2025-cuda/image-20250424193125125.webp)
**SIMT Thread Execution**:
Groups of 32 threads formed into warps. Threads in the same wraps always executing same instructions. And some threads may become inactive when code path diverges so the hardware **automatically Handles divergence**.
Warps are the primitive unit of scheduling.
> SIMT execution is an implementation choice. As sharing control logic leaves more space for ALUs.
**SM Warp Scheduling**:
SM hardware implements zero-overhead warp scheduling:
- Warps whose next instruction has its operands ready for consumption are eligible for execution.
- Eligible warps are selected for execution on a prioritized scheduling policy.
> If 4 clock cycles needed to dispatch the same instructions for all threads in a warp, and one global memory access is needed for every 4 instructions and memory latency is 200 cycles. So there should be 200 / (4 * 4) =12.5 (13) warps to fully tolerate the memory latency
The SM warp scheduling use scoreboard and similar things.
**Granularity Consideration**:
Consider that int the G80 GPU, one SM can run 768 threads and 8 thread blocks, which is the best tiles to matrix multiplication: 16 * 16 = 256 and in one SM there can be 3 thread block which fully use the threads.
### 2008: GT200 Architecture
![image-20250424195111341](./hpc-2025-cuda/image-20250424195111341.webp)
### 2010: Fermi GF100 GPU
**Fermi SM**:
![image-20250424195221886](./hpc-2025-cuda/image-20250424195221886.webp)
There are 32 cores per SM and 512 cores in total, and introduce 64KB configureable L1/ shared memory.
Decouple internal execution resource and dual issue pipelines to select two warps.
And in Fermi, the debut the Parallel Thread eXecution(PTX) 2.0 ISA.
### 2012 Kepler GK 110
![image-20250424200022880](./hpc-2025-cuda/image-20250424200022880.webp)
### 2014 Maxwell
4 GPCs and 16 SMM.
![image-20250424200330783](./hpc-2025-cuda/image-20250424200330783.webp)
### 2016 Pascal
No thing to pay attention to.
### 2017 Volta
First introduce the tensor core, which is the ASIC to calculate matrix multiplication.
### 2021 Ampere
The GA100 SM:
![image-20250508183446257](./hpc-2025-cuda/image-20250508183446257.webp)
### 2022 Hopper
Introduce the GH200 Grace Hopper Superchip:
![image-20250508183528381](./hpc-2025-cuda/image-20250508183528381.webp)
A system contains a CPU and GPU which is linked by a NVLink technology.
And this system can scale out for machine learning.
![image-20250508183724162](./hpc-2025-cuda/image-20250508183724162.webp)
Memory access across the NVLink:
- GPU to local CPU
- GPU to peer GPU
- GPU to peer CPU
![image-20250508183931464](./hpc-2025-cuda/image-20250508183931464.webp)
These operations can be handled by hardware accelerated memory coherency. Previously, there are separate page table for CPU and GPU but for GPU to access memory in both CPU and GPU, CPU and GPU can use the same page table.
![image-20250508184155087](./hpc-2025-cuda/image-20250508184155087.webp)
### 2025 Blackwell
![image-20250508184455215](./hpc-2025-cuda/image-20250508184455215.webp)
### Compute Capability
The software version to show hardware version features and specifications.
## G80 Memory Hierarchy
### Memory Space
Each thread can
- Read and write per-thread registers.
- Read and write per-thread local memory.
- Read and write pre-block shared memory.
- Read and write pre-grid global memory.
- Read only pre-grid constant memory.
- Read only pre-grid texture memory.
![image-20250508185236920](./hpc-2025-cuda/image-20250508185236920.webp)
Parallel Memory Sharing:
- Local memory is per-thread and mainly for auto variables and register spill.
- Share memory is pre-block which can be used for inter thread communication.
- Global memory is pre-application which can be used for inter grid communication.
### SM Memory Architecture
![image-20250508185812302](./hpc-2025-cuda/image-20250508185812302.webp)
Threads in a block share data and results in memory and shared memory.
Shared memory is dynamically allocated to blocks which is one of the limiting resources.
### SM Register File
Register File(RF): there are 32KB, or 8192 entries, register for each SM in G80 GPU.
The tex pipeline and local/store pipeline can read and write register file.
Registers are dynamically partitioned across all blocks assigned to the SM. Once assigned to a block the register is **not** accessible by threads in other blocks and each thread in the same block only access registers assigned to itself.
For a matrix multiplication example:
- If one thread uses 10 registers and one block has 16x16 threads, each SM can contains three thread blocks as one thread blocks need 16 * 16 * 10 =2,560 registers and 3 * 2560 < 8192.
- But if each thread need 11 registers, one SM can only contains two blocks once as 8192 < 2816 * 3.
More on dynamic partitioning: dynamic partitioning gives more flexibility to compilers and programmers.
1. A smaller number of threads that require many registers each.
2. A large number of threads that require few registers each.
So there is a tradeoff between instruction level parallelism and thread level parallelism.
### Parallel Memory Architecture
In a parallel machine, many threads access memory. So memory is divided into banks to achieve high bandwidth.
Each bank can service one address per cycle. If multiple simultaneous accesses to a bank result in a bank conflict.
Shared memory bank conflicts:
- The fast cases:
- All threads of a half-warp access different banks, there's no back conflict.
- All threads of a half-warp access the identical address ,there is no bank conflict (by broadcasting).
- The slow cases:
- Multiple threads in the same half-warp access the same bank
## Memory in Later Generations
### Fermi Architecture
**Unified Addressing Model** allows local, shared and global memory access using the same address space.
![image-20250508193756274](./hpc-2025-cuda/image-20250508193756274.webp)
**Configurable Caches** allows programmers to configure the size if L1 cache and the shared memory.
The L1 cache works as a counterpart to shared memory:
- Shared memory improves memory access for algorithms with well defined memory access.
- L1 cache improves memory access for irregular algorithms where data addresses are not known before hand.
### Pascal Architecture
**High Bandwidth Memory**: a technology which enables multiple layers of DRAM components to be integrated vertically on the package along with the GPU.
![image-20250508194350572](./hpc-2025-cuda/image-20250508194350572.webp)
**Unified Memory** provides a single and unified virtual address space for accessing all CPU and GPU memory in the system.
And the CUDA system software doesn't need to synchronize all managed memory allocations to the GPU before each kernel launch. This is enabled by **memory page faulting**.
## Advanced GPU Features
### GigaThread
Enable concurrent kernel execution:
![image-20250508195840957](./hpc-2025-cuda/image-20250508195840957.webp)
And provides dual **Streaming Data Transfer** engines to enable streaming data transfer, a.k.a direct memory access.
![image-20250508195938546](./hpc-2025-cuda/image-20250508195938546.webp)
### GPUDirect
![image-20250508200041910](./hpc-2025-cuda/image-20250508200041910.webp)
### GPU Boost
GPU Boost works through real time hardware monitoring as opposed to application based profiles. It attempts to find what is the appropriate GPU frequency and voltage for a given moment in time.
### SMX Architectural Details
Each unit contains four warp schedulers.
Scheduling functions:
- Register scoreboard for long latency operations.
- Inter-warp scheduling decisions.
- Thread block level scheduling.
### Improving Programmability
![image-20250515183524043](./hpc-2025-cuda/image-20250515183524043.webp)
**Dynamic Parallelism**: The ability to launch new grids from the GPU.
And then introduce data-dependent parallelism and dynamic work generation and even batched and nested parallelism.
The cpu controlled work batching:
- CPU program limited by single point of control.
- Can run at most 10s of threads.
- CPU is fully consumed with controlling launches.
![](./hpc-2025-cuda/image-20250515184225475.webp)
Batching via dynamic parallelism:
- Move top-level loops to GPUs.
- Run thousands of independent tasks.
- Release CPU for other work.
![image-20250515184621914](./hpc-2025-cuda/image-20250515184621914.webp)
### Grid Management Unit
![image-20250515184714663](./hpc-2025-cuda/image-20250515184714663.webp)
Fermi Concurrency:
- Up to 16 grids can run at once.
- But CUDA streams multiplex into a single queue.
- Overlap only at stream edge.
Kepler Improved Concurrency:
- Up to 32 grids can run at once.
- One work queue per stream.
- Concurrency at full-stream level.
- No inter-stream dependencies.
It is called as **Hyper-Q**.
Without Hyper-Q:
![image-20250515185019590](./hpc-2025-cuda/image-20250515185019590.webp)
With Hyper-Q:
![image-20250515185034758](./hpc-2025-cuda/image-20250515185034758.webp)
In pascal, **asynchronous concurrent computing** is introduced.
![image-20250515185801775](./hpc-2025-cuda/image-20250515185801775.webp)
### NVLink: High-Speed Node Network
![image-20250515185212184](./hpc-2025-cuda/image-20250515185212184.webp)
> The *consumer* prefix means the product is designed for gamers.
>
> The *big* prefix means the product is designed for HPC.
### Preemption
Pascal can actually preempt at the lowest level, the instruction level.
![image-20250515190244112](./hpc-2025-cuda/image-20250515190244112.webp)
### Tensor Core
Operates on a 4x4 matrix and performs: D = A x B + C.
![image-20250515190507199](./hpc-2025-cuda/image-20250515190507199.webp)
### GPU Multi-Process Scheduling
- Timeslice scheduling: single process throughput optimization.
- Multi process service: multi-process throughput optimization.
How about multi-process time slicing:
![image-20250515190703918](./hpc-2025-cuda/image-20250515190703918.webp)
Volta introduces the multi-process services:
![image-20250515191142384](./hpc-2025-cuda/image-20250515191142384.webp)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More