3 Commits

121 changed files with 44 additions and 2482 deletions

View File

@@ -15,9 +15,6 @@ trim_trailing_whitespace = true
[project.json] [project.json]
indent_size = 2 indent_size = 2
[*.{yaml,yml}]
indent_size = 2
# C# and Visual Basic files # C# and Visual Basic files
[*.{cs,vb}] [*.{cs,vb}]
charset = utf-8-bom charset = utf-8-bom

1
.gitattributes vendored
View File

@@ -1,5 +1,4 @@
*.png filter=lfs diff=lfs merge=lfs -text *.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text *.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.avif filter=lfs diff=lfs merge=lfs -text *.avif filter=lfs diff=lfs merge=lfs -text
*.webp filter=lfs diff=lfs merge=lfs -text *.webp filter=lfs diff=lfs merge=lfs -text

View File

@@ -7,30 +7,27 @@ jobs:
Build-Blog-Image: Build-Blog-Image:
runs-on: archlinux runs-on: archlinux
steps: steps:
- name: Check out code. - uses: https://mirrors.rrricardo.top/actions/checkout.git@v4
uses: https://mirrors.rrricardo.top/actions/checkout.git@v4 name: Check out code
with: with:
lfs: true lfs: true
- name: Build project. - name: Build project
run: | run: |
cd YaeBlog cd YaeBlog
dotnet publish dotnet publish
- name: Build docker image. - name: Build docker image
run: | run: |
proxy
podman pull mcr.microsoft.com/dotnet/aspnet:9.0
unproxy
cd YaeBlog cd YaeBlog
podman build . -t ccr.ccs.tencentyun.com/jackfiled/blog --build-arg COMMIT_ID=$(git rev-parse --short=10 HEAD) podman build . -t registry.cn-beijing.aliyuncs.com/jackfiled/blog:latest
- name: Workaround to make sure podman-login working. - name: Workaround to make sure podman login succeed
run: | run: |
mkdir /root/.docker mkdir /root/.docker
- name: Login tencent cloud docker registry. - name: Login aliyun docker registry
uses: https://mirrors.rrricardo.top/actions/podman-login.git@v1 uses: https://mirrors.rrricardo.top/actions/podman-login.git@v1
with: with:
registry: ccr.ccs.tencentyun.com registry: registry.cn-beijing.aliyuncs.com
username: 100044380877 username: 初冬的朝阳
password: ${{ secrets.TENCENT_REGISTRY_PASSWORD }} password: ${{ secrets.ALIYUN_PASSWORD }}
auth_file_path: /etc/containers/auth.json auth_file_path: /etc/containers/auth.json
- name: Push docker image. - name: Push docker image
run: podman push ccr.ccs.tencentyun.com/jackfiled/blog:latest run: podman push registry.cn-beijing.aliyuncs.com/jackfiled/blog:latest

View File

@@ -1,29 +0,0 @@
<div class="flex flex-wrap justify-center gap-12 max-w-md md:max-w-lg">
<div class="relative w-40 h-48 md:w-48 md:w-48 overflow-hidden
transition-all duration-300 ease-out hover:scale-125 group">
<img
src="./images/wechat-code.jpeg"
alt="微信赞赏码"
class="w-full h-full object-cover"
/>
<div class="absolute -bottom-8 left-0 right-0 text-center
text-white bg-black opacity-60 text-sm font-medium
backdrop-blur-sm group-hover:bottom-2 transition-all duration-300">
请我喝奶茶<br/>
</div>
</div>
<div class="relative w-40 h-48 md:w-48 md:h-48 overflow-hidden
transition-all duration-300 ease-out hover:scale-125 group">
<img
src="./images/alipay-code.jpeg"
alt="支付宝赞赏码"
class="w-full h-full object-cover"/>
<div class="absolute -bottom-8 left-0 right-0 text-center
text-white bg-black opacity-60 text-sm font-medium
backdrop-blur-sm group-hover:bottom-2 transition-all duration-300">
请我吃晚饭<br/>
</div>
</div>
</div>

View File

@@ -7,15 +7,11 @@
<Anchor Address="https://dotnet.microsoft.com" Text="@DotnetVersion"/> <Anchor Address="https://dotnet.microsoft.com" Text="@DotnetVersion"/>
驱动。 驱动。
</p> </p>
<p class="text-md">
Build Commit #
<Anchor Address="@BuildCommitUrl" Text="@BuildCommitId"/>
</p>
</div> </div>
<div> <div>
<p class="text-md"> <p class="text-md">
<Anchor Address="https://beian.miit.gov.cn" Text="蜀ICP备2022004429号-1" NewPage="true"/> <a href="https://beian.miit.gov.cn" target="_blank" class="text-black">蜀ICP备2022004429号-1</a>
</p> </p>
</div> </div>
</div> </div>
@@ -23,8 +19,4 @@
@code @code
{ {
private string DotnetVersion => $".NET {Environment.Version}"; private string DotnetVersion => $".NET {Environment.Version}";
private string BuildCommitId => Environment.GetEnvironmentVariable("COMMIT_ID") ?? "local_build";
private string BuildCommitUrl => $"https://git.rrricardo.top/jackfiled/YaeBlog/commit/{BuildCommitId}";
} }

View File

@@ -1,3 +1,6 @@
@using YaeBlog.Models
@inject BlogOptions Options
<div class="px-4 py-8 border border-sky-700 rounded-md bg-sky-200"> <div class="px-4 py-8 border border-sky-700 rounded-md bg-sky-200">
<div class="flex flex-col gap-3 text-md"> <div class="flex flex-col gap-3 text-md">
<div> <div>
@@ -21,17 +24,6 @@
Ricardo's Blog Ricardo's Blog
</a>”。 </a>”。
</div> </div>
<div class="flex flex-col">
<div class="flex justify-center">
<p>如果觉得不错的话,可以支持一下作者哦~</p>
</div>
<div class="flex justify-center">
<AppreciationCode/>
</div>
</div>
</div> </div>
</div> </div>

View File

@@ -1,8 +1,5 @@
FROM mcr.microsoft.com/dotnet/aspnet:9.0 FROM mcr.microsoft.com/dotnet/aspnet:9.0
ARG COMMIT_ID
ENV COMMIT_ID=${COMMIT_ID}
WORKDIR /app WORKDIR /app
COPY bin/Release/net9.0/publish/ ./ COPY bin/Release/net9.0/publish/ ./
COPY source/ ./source/ COPY source/ ./source/

View File

@@ -6,7 +6,10 @@ namespace YaeBlog.Models;
public record BlogContents(ConcurrentBag<BlogContent> Drafts, ConcurrentBag<BlogContent> Posts) public record BlogContents(ConcurrentBag<BlogContent> Drafts, ConcurrentBag<BlogContent> Posts)
: IEnumerable<BlogContent> : IEnumerable<BlogContent>
{ {
public IEnumerator<BlogContent> GetEnumerator() => Posts.Concat(Drafts).GetEnumerator(); IEnumerator<BlogContent> IEnumerable<BlogContent>.GetEnumerator()
{
IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); return Posts.Concat(Drafts).GetEnumerator();
}
public IEnumerator GetEnumerator() => ((IEnumerable<BlogContent>)this).GetEnumerator();
} }

View File

@@ -39,11 +39,6 @@
{ {
_page = Page ?? 1; _page = Page ?? 1;
_pageCount = Contents.Count / EssaysPerPage + 1; _pageCount = Contents.Count / EssaysPerPage + 1;
(_pageCount, int reminder) = int.DivRem(Contents.Count, EssaysPerPage);
if (reminder > 0)
{
_pageCount += 1;
}
if (EssaysPerPage * _page > Contents.Count + EssaysPerPage) if (EssaysPerPage * _page > Contents.Count + EssaysPerPage)
{ {

View File

@@ -16,11 +16,11 @@ public sealed class BlogHotReloadService(
await rendererService.RenderAsync(true); await rendererService.RenderAsync(true);
Task[] reloadTasks = [WatchFileAsync(stoppingToken)]; Task[] reloadTasks = [FileWatchTask(stoppingToken)];
await Task.WhenAll(reloadTasks); await Task.WhenAll(reloadTasks);
} }
private async Task WatchFileAsync(CancellationToken token) private async Task FileWatchTask(CancellationToken token)
{ {
while (!token.IsCancellationRequested) while (!token.IsCancellationRequested)
{ {
@@ -33,15 +33,6 @@ public sealed class BlogHotReloadService(
break; break;
} }
FileInfo changeFileInfo = new(changeFile);
if (changeFileInfo.Name.StartsWith('.'))
{
// Ignore dot-started file and directory.
logger.LogDebug("Ignore hidden file: {}.", changeFile);
continue;
}
logger.LogInformation("{} changed, re-rendering.", changeFile); logger.LogInformation("{} changed, re-rendering.", changeFile);
essayContentService.Clear(); essayContentService.Clear();
await rendererService.RenderAsync(true); await rendererService.RenderAsync(true);

View File

@@ -109,12 +109,6 @@ public partial class EssayScanService : IEssayScanService
{ {
foreach (BlogResult blog in fileContents) foreach (BlogResult blog in fileContents)
{ {
if (blog.BlogContent.Length < 4)
{
// Even not contains a legal header.
continue;
}
int endPos = blog.BlogContent.IndexOf("---", 4, StringComparison.Ordinal); int endPos = blog.BlogContent.IndexOf("---", 4, StringComparison.Ordinal);
if (!blog.BlogContent.StartsWith("---") || endPos is -1 or 0) if (!blog.BlogContent.StartsWith("---") || endPos is -1 or 0)
{ {
@@ -127,14 +121,14 @@ public partial class EssayScanService : IEssayScanService
try try
{ {
MarkdownMetadata metadata = _yamlDeserializer.Deserialize<MarkdownMetadata>(metadataString); MarkdownMetadata metadata = _yamlDeserializer.Deserialize<MarkdownMetadata>(metadataString);
_logger.LogDebug("Scan metadata title: '{title}' for {name}.", metadata.Title, blog.BlogFile.Name); _logger.LogDebug("Scan metadata title: '{}' for {}.", metadata.Title, blog.BlogFile.Name);
contents.Add(new BlogContent(blog.BlogFile, metadata, blog.BlogContent[(endPos + 3)..], isDraft, contents.Add(new BlogContent(blog.BlogFile, metadata, blog.BlogContent[(endPos + 3)..], isDraft,
blog.Images, blog.NotFoundImages)); blog.Images, blog.NotFoundImages));
} }
catch (YamlException e) catch (YamlException e)
{ {
_logger.LogWarning("Failed to parser metadata from {name} due to {exception}, skipping", blog.BlogFile.Name, e); _logger.LogWarning("Failed to parser metadata from {} due to {}, skipping", blog.BlogFile.Name, e);
} }
} }
}); });

View File

@@ -1,8 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk.Web"> <Project Sdk="Microsoft.NET.Sdk.Web">
<ItemGroup> <ItemGroup>
<PackageReference Include="ImageFlow.NativeRuntime.ubuntu-x86_64" Version="2.1.0-rc11" Condition="$([MSBuild]::IsOsPlatform('Linux'))"/> <PackageReference Include="ImageFlow.NativeRuntime.ubuntu-x86_64" Version="2.1.0-rc11"/>
<PackageReference Include="ImageFlow.NativeRuntime.osx-arm64" Version="2.1.0-rc11" Condition="$([MSBuild]::IsOsPlatform('OSX'))"/>
<PackageReference Include="ImageFlow.Net" Version="0.13.2"/> <PackageReference Include="ImageFlow.Net" Version="0.13.2"/>
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1"/> <PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1"/>
<PackageReference Include="AngleSharp" Version="1.1.0"/> <PackageReference Include="AngleSharp" Version="1.1.0"/>

View File

@@ -1,370 +0,0 @@
---
title: High Performance Computing 25 SP NVIDIA
date: 2025-08-31T13:50:42.8639950+08:00
tags:
- 高性能计算
- 学习资料
---
Fxxk you, NVIDIA!
<!--more-->
CPU/GPU Parallelism:
Moore's Law gives you more and more transistors:
- CPU strategy: make one compute thread run as fast as possible.
- GPU strategy: run as many threads as possible to maximize total throughput.
GPU Architecture:
- Massively Parallel
- Power Efficient
- Memory Bandwidth
- Commercially Viable Parallelism
- Not dependent on large caches for performance
![image-20250424192311202](./hpc-2025-cuda/image-20250424192311202.webp)
## Nvidia GPU Generations
- 2006: G80-based GeForce 8800
- 2008: GT200-based GeForce GTX 280
- 2010: Fermi
- 2012: Kepler
- 2014: Maxwell
- 2016: Pascal
- 2017: Volta
- 2021: Ampere
- 2022: Hopper
- 2024: Blackwell
#### 2006: G80 Terminology
SP: Streaming Processor, scalar ALU for a single CUDA thread
SPA: Stream Processor Array
SM: Streaming Multiprocessor, consisting of 8 SPs
TPC: Texture Processor Cluster: 2 SM + TEX
![image-20250424192825010](./hpc-2025-cuda/image-20250424192825010.webp)
Design goal: performance per millimeter
For GPUs, performance means throughput, so latency is hidden with computation rather than caches.
This leads to single instruction, multiple thread (SIMT) execution.
**Thread Life Cycle**:
The grid is launched on the SPA and thread blocks are serially distributed to all SMs.
![image-20250424193125125](./hpc-2025-cuda/image-20250424193125125.webp)
**SIMT Thread Execution**:
Groups of 32 threads are formed into warps. Threads in the same warp always execute the same instruction, and some threads may become inactive when the code path diverges, so the hardware **automatically handles divergence**.
Warps are the primitive unit of scheduling.
> SIMT execution is an implementation choice, as sharing control logic leaves more space for ALUs.
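A minimal CUDA sketch of how divergence shows up in SIMT code (the kernel and names are illustrative, not taken from the course material):

```cuda
__global__ void divergence_demo(const float *in, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;

    // Lanes of the same warp take different paths depending on their data, so
    // the hardware serializes both paths and masks off the inactive lanes.
    if (in[i] > 0.0f)
        out[i] = in[i] * 2.0f;   // lanes with positive input
    else
        out[i] = -in[i];         // the remaining lanes

    // A branch on something uniform across the warp (e.g. blockIdx.x) does not
    // diverge, because all 32 lanes agree on the condition.
}
```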
**SM Warp Scheduling**:
SM hardware implements zero-overhead warp scheduling:
- Warps whose next instruction has its operands ready for consumption are eligible for execution.
- Eligible warps are selected for execution based on a prioritized scheduling policy.
> If 4 clock cycles are needed to dispatch the same instruction for all threads in a warp, one global memory access is issued every 4 instructions, and the memory latency is 200 cycles, then 200 / (4 * 4) = 12.5, i.e. 13 warps, are needed to fully tolerate the memory latency.
The SM warp scheduler uses a scoreboard and similar mechanisms.
**Granularity Consideration**:
In the G80 GPU, one SM can run 768 threads and 8 thread blocks. A 16 * 16 = 256-thread tile is the best tile size for matrix multiplication, since one SM can then hold 3 thread blocks and fully use its 768 threads.
### 2008: GT200 Architecture
![image-20250424195111341](./hpc-2025-cuda/image-20250424195111341.webp)
### 2010: Fermi GF100 GPU
**Fermi SM**:
![image-20250424195221886](./hpc-2025-cuda/image-20250424195221886.webp)
There are 32 cores per SM and 512 cores in total, and Fermi introduces 64 KB of configurable L1 cache / shared memory.
It decouples the internal execution resources and uses dual-issue pipelines that select two warps at a time.
Fermi also debuts the Parallel Thread eXecution (PTX) 2.0 ISA.
### 2012 Kepler GK 110
![image-20250424200022880](./hpc-2025-cuda/image-20250424200022880.webp)
### 2014 Maxwell
4 GPCs and 16 SMM.
![image-20250424200330783](./hpc-2025-cuda/image-20250424200330783.webp)
### 2016 Pascal
Nothing special to pay attention to.
### 2017 Volta
Volta first introduces the tensor core, a dedicated unit for matrix multiplication.
### 2021 Ampere
The GA100 SM:
![image-20250508183446257](./hpc-2025-cuda/image-20250508183446257.webp)
### 2022 Hopper
Introduce the GH200 Grace Hopper Superchip:
![image-20250508183528381](./hpc-2025-cuda/image-20250508183528381.webp)
The system combines a CPU and a GPU linked by NVLink.
This system can scale out for machine learning.
![image-20250508183724162](./hpc-2025-cuda/image-20250508183724162.webp)
Memory access across the NVLink:
- GPU to local CPU
- GPU to peer GPU
- GPU to peer CPU
![image-20250508183931464](./hpc-2025-cuda/image-20250508183931464.webp)
These operations are handled by hardware-accelerated memory coherency. Previously there were separate page tables for the CPU and the GPU; now, so that the GPU can access memory on both sides, the CPU and GPU can share the same page table.
![image-20250508184155087](./hpc-2025-cuda/image-20250508184155087.webp)
### 2024 Blackwell
![image-20250508184455215](./hpc-2025-cuda/image-20250508184455215.webp)
### Compute Capability
The compute capability is a software-visible version number that describes the hardware's features and specifications.
## G80 Memory Hierarchy
### Memory Space
Each thread can
- Read and write per-thread registers.
- Read and write per-thread local memory.
- Read and write per-block shared memory.
- Read and write per-grid global memory.
- Read only per-grid constant memory.
- Read only per-grid texture memory.
![image-20250508185236920](./hpc-2025-cuda/image-20250508185236920.webp)
Parallel Memory Sharing:
- Local memory is per-thread and mainly for auto variables and register spill.
- Shared memory is per-block and can be used for inter-thread communication.
- Global memory is per-application and can be used for inter-grid communication (a code sketch of these spaces follows below).
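A small CUDA kernel sketch showing where each of these spaces appears in code (names and sizes are illustrative; it assumes 256-thread blocks and that `coeff` is filled from the host with `cudaMemcpyToSymbol`):

```cuda
__constant__ float coeff[16];                 // per-grid, read-only constant memory

__global__ void memory_spaces(const float *global_in,   // per-grid global memory
                              float *global_out, int n)
{
    __shared__ float tile[256];               // per-block shared memory
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    tile[threadIdx.x] = (i < n) ? global_in[i] : 0.0f;
    __syncthreads();                          // make the tile visible to the whole block

    float acc = tile[threadIdx.x] * coeff[threadIdx.x % 16];  // per-thread register
    if (i < n)                                                // (may spill to local memory)
        global_out[i] = acc;
}
```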
### SM Memory Architecture
![image-20250508185812302](./hpc-2025-cuda/image-20250508185812302.webp)
Threads in a block share data and results in memory and shared memory.
Shared memory is dynamically allocated to blocks and is one of the limiting resources.
### SM Register File
Register File (RF): each SM in the G80 GPU has 32 KB of registers, i.e. 8192 entries.
The TEX pipeline and the load/store pipeline can read and write the register file.
Registers are dynamically partitioned across all blocks assigned to the SM. Once assigned to a block, the registers are **not** accessible by threads in other blocks, and each thread in a block can only access the registers assigned to itself.
For a matrix multiplication example:
- If one thread uses 10 registers and one block has 16x16 threads, each SM can hold three thread blocks, since one block needs 16 * 16 * 10 = 2,560 registers and 3 * 2,560 = 7,680 < 8,192.
- But if each thread needs 11 registers, one SM can only hold two blocks at a time, since one block needs 16 * 16 * 11 = 2,816 registers and 3 * 2,816 = 8,448 > 8,192.
More on dynamic partitioning: dynamic partitioning gives more flexibility to compilers and programmers.
1. A smaller number of threads that require many registers each.
2. A large number of threads that require few registers each.
So there is a tradeoff between instruction level parallelism and thread level parallelism.
### Parallel Memory Architecture
In a parallel machine, many threads access memory. So memory is divided into banks to achieve high bandwidth.
Each bank can service one address per cycle; multiple simultaneous accesses to the same bank result in a bank conflict.
Shared memory bank conflicts:
- The fast cases:
  - All threads of a half-warp access different banks: there is no bank conflict.
  - All threads of a half-warp access the identical address: there is no bank conflict (the value is broadcast).
- The slow case:
  - Multiple threads in the same half-warp access the same bank, so the accesses are serialized (illustrated below).
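A hedged CUDA sketch of the fast and slow cases (it assumes 32 banks of 4-byte words and a single-warp block; the G80 described above addresses 16 banks per half-warp, but the idea is the same):

```cuda
__global__ void bank_conflict_demo(float *out)
{
    __shared__ float data[32 * 32];
    int tid = threadIdx.x;                    // assumes blockDim.x == 32 (one warp)

    for (int k = tid; k < 32 * 32; k += 32)   // fill the tile with a stride-1 pattern
        data[k] = (float)k;
    __syncthreads();

    float fast      = data[tid];              // stride 1: every lane hits a different bank
    float broadcast = data[0];                // one address for all lanes: served by broadcast
    float slow      = data[tid * 32];         // stride 32: every lane hits bank 0, 32-way conflict

    out[tid] = fast + broadcast + slow;
}
```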
## Memory in Later Generations
### Fermi Architecture
**Unified Addressing Model** allows local, shared and global memory access using the same address space.
![image-20250508193756274](./hpc-2025-cuda/image-20250508193756274.webp)
**Configurable Caches** allow programmers to configure the sizes of the L1 cache and the shared memory.
The L1 cache works as a counterpart to shared memory:
- Shared memory improves memory access for algorithms with well defined memory access.
- L1 cache improves memory access for irregular algorithms where data addresses are not known beforehand.
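On Fermi and later GPUs this split can be requested per kernel through the CUDA runtime; a hedged host-side sketch with hypothetical kernels:

```cuda
#include <cuda_runtime.h>

__global__ void tiled_kernel(float *p)     { /* regular, tiled shared-memory accesses */ }
__global__ void irregular_kernel(float *p) { /* data-dependent, irregular accesses */ }

int main()
{
    // Favor shared memory (e.g. 48 KB shared / 16 KB L1) for the regular kernel...
    cudaFuncSetCacheConfig(tiled_kernel, cudaFuncCachePreferShared);
    // ...and favor L1 for the kernel whose addresses are not known beforehand.
    cudaFuncSetCacheConfig(irregular_kernel, cudaFuncCachePreferL1);
    return 0;
}
```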
### Pascal Architecture
**High Bandwidth Memory**: a technology which enables multiple layers of DRAM components to be integrated vertically on the package along with the GPU.
![image-20250508194350572](./hpc-2025-cuda/image-20250508194350572.webp)
**Unified Memory** provides a single and unified virtual address space for accessing all CPU and GPU memory in the system.
And the CUDA system software doesn't need to synchronize all managed memory allocations to the GPU before each kernel launch. This is enabled by **memory page faulting**.
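A minimal sketch of Pascal-style unified memory in CUDA (sizes and the kernel are illustrative):

```cuda
#include <cuda_runtime.h>

__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

int main()
{
    const int n = 1 << 20;
    float *data = nullptr;

    // One allocation visible to both CPU and GPU; pages migrate on demand via
    // page faulting instead of being copied up front.
    cudaMallocManaged(&data, n * sizeof(float));
    for (int i = 0; i < n; ++i)
        data[i] = 1.0f;                        // first touched on the CPU

    scale<<<(n + 255) / 256, 256>>>(data, n);  // faults migrate the pages to the GPU
    cudaDeviceSynchronize();                   // afterwards the CPU can read the results

    cudaFree(data);
    return 0;
}
```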
## Advanced GPU Features
### GigaThread
Enable concurrent kernel execution:
![image-20250508195840957](./hpc-2025-cuda/image-20250508195840957.webp)
It also provides dual **Streaming Data Transfer** engines for streaming data transfer, a.k.a. direct memory access (DMA).
![image-20250508195938546](./hpc-2025-cuda/image-20250508195938546.webp)
### GPUDirect
![image-20250508200041910](./hpc-2025-cuda/image-20250508200041910.webp)
### GPU Boost
GPU Boost works through real-time hardware monitoring rather than application-based profiles. It attempts to find the appropriate GPU frequency and voltage for a given moment in time.
### SMX Architectural Details
Each unit contains four warp schedulers.
Scheduling functions:
- Register scoreboard for long latency operations.
- Inter-warp scheduling decisions.
- Thread block level scheduling.
### Improving Programmability
![image-20250515183524043](./hpc-2025-cuda/image-20250515183524043.webp)
**Dynamic Parallelism**: The ability to launch new grids from the GPU.
This enables data-dependent parallelism, dynamic work generation, and even batched and nested parallelism.
CPU-controlled work batching:
- CPU program limited by single point of control.
- Can run at most 10s of threads.
- CPU is fully consumed with controlling launches.
![](./hpc-2025-cuda/image-20250515184225475.webp)
Batching via dynamic parallelism:
- Move top-level loops to GPUs.
- Run thousands of independent tasks.
- Release CPU for other work.
![image-20250515184621914](./hpc-2025-cuda/image-20250515184621914.webp)
### Grid Management Unit
![image-20250515184714663](./hpc-2025-cuda/image-20250515184714663.webp)
Fermi Concurrency:
- Up to 16 grids can run at once.
- But CUDA streams multiplex into a single queue.
- Overlap only at stream edge.
Kepler Improved Concurrency:
- Up to 32 grids can run at once.
- One work queue per stream.
- Concurrency at full-stream level.
- No inter-stream dependencies.
This is called **Hyper-Q**.
Without Hyper-Q:
![image-20250515185019590](./hpc-2025-cuda/image-20250515185019590.webp)
With Hyper-Q:
![image-20250515185034758](./hpc-2025-cuda/image-20250515185034758.webp)
In Pascal, **asynchronous concurrent computing** is introduced.
![image-20250515185801775](./hpc-2025-cuda/image-20250515185801775.webp)
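A hedged sketch of the multi-stream pattern that Hyper-Q is designed to speed up (kernel, sizes and stream count are illustrative):

```cuda
#include <cuda_runtime.h>

__global__ void work(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= data[i];
}

int main()
{
    const int n = 1 << 20, num_streams = 8;
    float *buffers[num_streams];
    cudaStream_t streams[num_streams];

    for (int k = 0; k < num_streams; ++k)
    {
        cudaStreamCreate(&streams[k]);
        cudaMalloc(&buffers[k], n * sizeof(float));
        // With Hyper-Q each stream feeds its own hardware queue, so these
        // independent grids can overlap instead of serializing behind one queue.
        work<<<(n + 255) / 256, 256, 0, streams[k]>>>(buffers[k], n);
    }

    cudaDeviceSynchronize();
    for (int k = 0; k < num_streams; ++k)
    {
        cudaFree(buffers[k]);
        cudaStreamDestroy(streams[k]);
    }
    return 0;
}
```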
### NVLink: High-Speed Node Network
![image-20250515185212184](./hpc-2025-cuda/image-20250515185212184.webp)
> The *consumer* prefix means the product is designed for gamers.
>
> The *big* prefix means the product is designed for HPC.
### Preemption
Pascal can actually preempt at the lowest level, the instruction level.
![image-20250515190244112](./hpc-2025-cuda/image-20250515190244112.webp)
### Tensor Core
Each tensor core operates on 4x4 matrices and performs D = A x B + C.
![image-20250515190507199](./hpc-2025-cuda/image-20250515190507199.webp)
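On Volta and later, CUDA exposes this operation through the warp-level `wmma` API, which works on 16x16x16 tiles built from the small hardware tiles; a hedged sketch in which one warp computes one tile (launch with e.g. `<<<1, 32>>>`):

```cuda
#include <cuda_fp16.h>
#include <mma.h>
using namespace nvcuda;

// One warp computes a 16x16 tile of D = A x B + C on tensor cores
// (requires compute capability 7.0 or newer).
__global__ void wmma_tile(const half *a, const half *b, const float *c, float *d)
{
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

    wmma::load_matrix_sync(a_frag, a, 16);                    // leading dimension 16
    wmma::load_matrix_sync(b_frag, b, 16);
    wmma::load_matrix_sync(c_frag, c, 16, wmma::mem_row_major);

    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);           // D = A x B + C
    wmma::store_matrix_sync(d, c_frag, 16, wmma::mem_row_major);
}
```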
### GPU Multi-Process Scheduling
- Timeslice scheduling: single process throughput optimization.
- Multi process service: multi-process throughput optimization.
How about multi-process time slicing:
![image-20250515190703918](./hpc-2025-cuda/image-20250515190703918.webp)
Volta introduces the multi-process service (MPS):
![image-20250515191142384](./hpc-2025-cuda/image-20250515191142384.webp)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,224 +0,0 @@
---
title: High Performance Computing 25 SP Distributed System
date: 2025-05-10T00:31:39.3109950+08:00
tags:
- 高性能计算
- 学习资料
---
The motivation of distributed system is resource sharing.
<!--more-->
### Definition of a Distributed System
- A collection of independent computers that appears to its users as a single coherent system.
- A system in which hardware and software components located at networked computers communicate and coordinate their actions only by message passing.
Important aspects:
- Components are autonomous.
- The system appears to users as virtually a single system, i.e. **transparency**.
### Kinds of Systems
**Clustering**:
A cluster is a group of independent resources that are interconnected and work as a single system.
A general prerequisite of hardware clustering is that its component systems have reasonably identical hardware and operating system to provide similar performance levels when one failed component is to be replaced by another.
**Peer-to-Peer Network**:
P2P systems are quite popular for file sharing, content distribution and Internet telephony.
**Grid Computing**:
A computing grid (or *computational grid* ) provides a platform in which computing resources are organized into one or more logical pools.
**Cloud Computing**:
Enables clients to outsource their software usage, data storage and even the computing infrastructure to remote data centers.
![image-20250410193527994](./hpc-2025-distributed-system/image-20250410193527994.webp)
**Fog Computing**:
Fog computing focuses processing efforts at the local area network end of the chain.
**Edge Computing**:
Edge computing takes localized processing a bit farther, push these efforts closer to the data sources.
**Near Resources Computing**:
While CPUs become more powerful, so do I/O devices, so domain-specific computing can be offloaded from the CPU.
### Features of Distributed System
**Transparency**:
- Access transparency.
- Location transparency.
- Migration transparency.
- Relocation transparency.
- Replication transparency.
- Concurrency transparency.
- Failure transparency.
**Openness**:
- Open distributed systems: offer services according to standard rules that describe the syntax and semantics of those services.
- Services are specified through *interfaces*.
**Scalability**:
Size scalability: more users and more resources.
- Centralized services: a single server for all users.
- Centralized data: a single on-line telephone book.
- Centralized algorithms: doing routing based on complete information.
### Common Problems in Distributed Systems
1. Leader Election
2. Mutual Exclusion
3. Time Synchronization
4. Global State
5. Multicasting
6. Replica Management
### Time in Distributed Systems
Atomic clocks: modern timekeepers use atomic clocks as a de facto primary standard of time.
**Happened Before Relationship**:
Three basic rules about the causal ordering of events, and they collectively define the *happened before* a.k.a the *causally ordered before* relationship.
- Rule 1: Let each process have a physical clock whose value is monotonically increasing.
- Rule 2: If *a* is the event of sending a message by process *P*, and *b* is the event of receiving the same message by another process *Q*, then a < b.
- Rule 3: a < b and b < c imply a < c.
The space time diagrams show such relationship:
![image-20250417184421464](./hpc-2025-distributed-system/image-20250417184421464.webp)
**Logical Clocks**:
A logical clock is an event counter that respects causal ordering.
**Vector Clocks**:
The primary goal of vector clocks is to detect causality, which is the major weakness of logical clocks.
![image-20250424183610157](./hpc-2025-distributed-system/image-20250424183610157.webp)
![image-20250424183629681](./hpc-2025-distributed-system/image-20250424183629681.webp)
![image-20250424183645210](./hpc-2025-distributed-system/image-20250424183645210.webp)
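A minimal host-side sketch of the vector clock rules (plain C++ host code; the process count, names and helpers are illustrative):

```cuda
#include <algorithm>
#include <vector>

// Vector clock of one out of N processes: increment the own slot on a local
// event, attach the clock to outgoing messages, merge element-wise on receive.
struct VectorClock {
    std::vector<int> v;
    int self;

    VectorClock(int n, int id) : v(n, 0), self(id) {}

    void local_event() { ++v[self]; }
    std::vector<int> send() { ++v[self]; return v; }
    void receive(const std::vector<int> &msg)
    {
        for (std::size_t i = 0; i < v.size(); ++i)
            v[i] = std::max(v[i], msg[i]);
        ++v[self];
    }
};

// a happened before b iff a <= b element-wise and a != b; two incomparable
// clocks mean concurrent events, which plain logical clocks cannot detect.
bool happened_before(const std::vector<int> &a, const std::vector<int> &b)
{
    bool strictly_less = false;
    for (std::size_t i = 0; i < a.size(); ++i) {
        if (a[i] > b[i]) return false;
        if (a[i] < b[i]) strictly_less = true;
    }
    return strictly_less;
}
```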
**Synchronization Classification**:
Types of synchronization:
- External synchronization
- Internal synchronization
- Phase synchronization
> Types of clocks:
>
> - Unbounded
> - Bounded
>
> Unbounded clocks are not realistic but are easier to deal with in the design of algorithms. Real clocks are always bounded.
**External Synchronization**:
To maintain the reading of each clock as close to the UTC as possible.
The NTP is an external synchronization protocol.
**Internal Synchronization**:
To keep the readings of a system of autonomous clocks closely synchronized with one another, despite the failure or malfunction of one or more clocks.
Of course external synchronization implies internal synchronization.
**Phase Synchronization**:
Many distributed computations run in phases: in a given phase all processes execute some actions which are followed by the next phase.
## Data Center Organization
A data center is a facility used to house computer systems and associated components.
![image-20250417185200176](./hpc-2025-distributed-system/image-20250417185200176.webp)
## Cloud Computing
Cloud computing is a specialized form of distributed computing that introduces utilization models for remotely provisioning scalable and measured resources.
>**NIST definition**:
>
>Cloud computing is a model for enabling ubiquitous, convenient, on-demand network access to a shared pool of configurable computing resources (e.g., networks, servers, storage, applications, and services) that can be rapidly provisioned and released with minimal management effort or service provider interaction. This cloud model is composed of five essential characteristics, three service models, and four deployment models.
![image-20250417190247790](./hpc-2025-distributed-system/image-20250417190247790.webp)
**Cloud Characteristics**:
- On-demand Usage
- Ubiquitous Access
- Multitenancy
- Elasticity
- Measure Usage
- Resiliency
**Cloud Delivery Models**:
A cloud service delivery model represents a specific pre-packaged combination of IT resources offered by a cloud provider.
- Infrastructure as a Service `IaaS`
- Platform as a Service `PaaS`
- Software as a Service `SaaS`
**Hypervisor**:
Type 1 hypervisor:
![image-20250417191509682](./hpc-2025-distributed-system/image-20250417191509682.webp)
Type 2 hypervisor:
![image-20250417191526416](./hpc-2025-distributed-system/image-20250417191526416.webp)
**CPU Virtualization**:
Intel VT-x and AMD SVM:
- Introduce virtualization technology processors with an extra instruction set called Virtual Machine Extensions or VMX.
- Add additional operating model for host and guest.
- Support for swapping state between guest and host.
- Support for hiding privileged state.
![image-20250417192453944](./hpc-2025-distributed-system/image-20250417192453944.webp)
## Big Data Processing
**MapReduce Programming Model**
MapReduce is based on a very simple idea for parallel processing of data-intensive applications supporting arbitrarily divisible load sharing.
> The so-called single program, multiple data (SPMD) paradigm.
**MapReduce Logical Data Flow**:
The input data and output data of both the Map and Reduce functions have a particular structure.
Sending computation toward data rather than sending data toward computation.
**Resilient Distributed Dataset**
An RDD is a read-only partitioned collection of records.

View File

@@ -1,80 +0,0 @@
---
title: High Performance Computing 25 SP Heterogeneous Computing
date: 2025-05-10T00:36:20.5391570+08:00
tags:
- 高性能计算
- 学习资料
---
Heterogeneous Computing is on the way!
<!--more-->
## GPU Computing Ecosystem
CUDA: NVIDIA's Architecture for GPU computing.
![image-20250417195644624](./hpc-2025-heterogeneous-system/image-20250417195644624.webp)
## Internal Buses
**HyperTransport**:
Primarily a low latency direct chip to chip interconnect, supports mapping to board to board interconnect such as PCIe.
**PCI Express**
Switched and point-to-point connection.
**NVLink**
![image-20250417200241703](./hpc-2025-heterogeneous-system/image-20250417200241703.webp)
**OpenCAPI**
Heterogeneous computing in the professional world was mostly limited to HPC; in the consumer world it is a "nice to have".
But OpenCAPI is absorbed by CXL.
## CPU-GPU Arrangement
![image-20250424184701573](./hpc-2025-heterogeneous-system/image-20250424184701573.webp)
### First Stage: Intel Northbridge
![image-20250424185022360](./hpc-2025-heterogeneous-system/image-20250424185022360.webp)
### Second Stage: Symmetric Multiprocessors
![image-20250424185048036](./hpc-2025-heterogeneous-system/image-20250424185048036.webp)
### Third Stage: Nonuniform Memory Access
And the memory controller is integrated directly in the CPU.
![image-20250424185152081](./hpc-2025-heterogeneous-system/image-20250424185152081.webp)
In this context, a system with multiple CPUs is called NUMA:
![image-20250424185219673](./hpc-2025-heterogeneous-system/image-20250424185219673.webp)
And similarly there can be multiple GPUs:
![image-20250424185322963](./hpc-2025-heterogeneous-system/image-20250424185322963.webp)
### Fourth Stage: Integrated PCIe in CPU
![image-20250424185354247](./hpc-2025-heterogeneous-system/image-20250424185354247.webp)
And there is the term *integrated GPU*, where a GPU is integrated into the CPU chip.
![image-20250424185449577](./hpc-2025-heterogeneous-system/image-20250424185449577.webp)
And the integrated GPU can work with discrete GPUs:
![image-20250424185541483](./hpc-2025-heterogeneous-system/image-20250424185541483.webp)
### Final Stage: Multi GPU Board
![image-20250424190159059](./hpc-2025-heterogeneous-system/image-20250424190159059.webp)

View File

@@ -1,241 +0,0 @@
---
title: High Performance Computing 25 SP Non Stored Program Computing
date: 2025-08-31T13:51:17.5260660+08:00
tags:
- 高性能计算
- 学习资料
---
No Von Neumann Machines.
<!--more-->
## Application Specified Integrated Circuits
Also known as **ASICs**, these hardware devices can work on their own and are not von Neumann machines.
No stored program concept:
- Input data come in
- Pass through all circuit gates quickly
- Generate output results immediately
Advantages: performance is better.
Disadvantages: reusability is worse.
> The CPU and GPU are special kinds of ASIC.
Why we need ASIC in computing:
- Alternatives to Moore's law.
- High capacity and high speed.
![image-20250605185212740](./hpc-2025-non-stored-program-computing/image-20250605185212740.webp)
### Full Custom ASICs
All mask layers are customized in a full-custom ASIC.
Full-custom ASICs can offer the highest performance and the lowest part cost (smallest die size) for a given design.
A typical example of full-custom ASICs is the CPU.
The advantages and disadvantages of full-custom ASICs are shown below.
| Advantages | Disadvantages |
| ------------------------------------------------------------ | -------------------------------------------------------- |
| Reducing the area | The design process takes a longer time |
| Enhancing the performance | Having more complexity in computer-aided design tool |
| Better ability of integrating with other analog components and other pre-designed components | Requiring higher investment and skilled human resources. |
### Semi Custom ASICs
All the logic cells are predesigned and some (or all) of the mask layers are customized.
There are two types of semi-custom ASICs:
- Standard cell based ASICs
- Gate-array based ASICs.
The standard-cell-based ASIC is also called a **cell-based ASIC (CBIC)**.
![image-20250815093113115](./hpc-2025-non-stored-program-computing/image-20250815093113115.webp)
> The *gate* is used as a unit to measure how many logic elements a semiconductor device can hold.
Semi-custom ASICs developed as:
- Programmable Logic Array(PLA)
- Complex Programmable Logical Device(CPLD)
- Programmable Array Logical
- Field Programmable Gate Array (FPGA)
#### Programmable Logical Device
An integrated circuit that can be programmed/reprogrammed with digital logic of a certain complexity.
The basic idea of PLD is an array of **AND** gates and an array of **OR** gates. Each input feeds both a non-inverting buffer and an inverting buffer to produce the true and inverted forms of each variable. The AND outputs are called the product lines. Each product line is connected to one of the inputs of each OR gate.
Depending on the structure, the standard PLD can be divided into:
- Read Only Memory(ROM): A fixed array of AND gates and a programmable array of OR gates.
- Programmable Array Logic(PAL): A programmable array of AND gates feeding a fixed array of OR gates.
- Programmable Logic Array(PLA): A programmable array of AND gates feeding a programmable array of OR gates.
- Complex Programmable Logic Device(CPLD) and Field Programmable Gate Array(FPGA): complex enough to be called *architectures*.
![image-20250817183832472](./hpc-2025-non-stored-program-computing/image-20250817183832472.webp)
## Field Programmable Gate Array
> Generally speaking, every semiconductor circuit can be considered a special kind of ASIC. But in practice, we usually call a circuit with a fixed function an ASIC and a circuit whose function can be changed an FPGA.
![image-20250612184120333](./hpc-2025-non-stored-program-computing/image-20250612184120333.webp)
### FPGA Architecture
![image-20250817184419856](./hpc-2025-non-stored-program-computing/image-20250817184419856.webp)
#### Configurable Logic Block(CLB) Architecture
The CLB consists of:
- Look-up Table (LUT): implements the entries of a logic function's truth table. Some FPGAs can also use the LUTs to implement small random access memories (RAM).
- Carry and Control Logic: implements fast arithmetic operations (adders/subtractors).
- Memory Elements: configurable flip-flops/latches (programmable clock edges, set, reset and clock enable). These memory elements can usually also be configured as shift registers.
##### Configuring LUTs
A LUT is a RAM with a data width of 1 bit whose content is programmed at power-up. Internal signals connect to the control inputs of MUXes to select the truth-table value for any given input signals.
The figure below shows a LUT at work:
![image-20250817185111521](./hpc-2025-non-stored-program-computing/image-20250817185111521.webp)
The configuration memory holds the truth-table entries, so that when the FPGA restarts it runs with the same *program*.
Since the truth-table entries are just bits, the FPGA program is called a **BITSTREAM**: we download a bitstream to an FPGA and all LUTs are configured from it to implement the Boolean logic.
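A tiny software model of a 4-input LUT may make this concrete (illustrative host-side code, not an actual FPGA primitive): the 16-bit configuration word is the truth table taken from the bitstream, and the four inputs form the read address.

```cuda
#include <cstdint>

// config holds the 16 truth-table entries loaded from the bitstream; the four
// inputs select which entry drives the LUT output.
bool lut4(std::uint16_t config, bool a, bool b, bool c, bool d)
{
    unsigned addr = (a << 3) | (b << 2) | (c << 1) | d;
    return (config >> addr) & 1u;
}

// Example: configured as a 4-input AND gate, only entry 0b1111 is 1:
// lut4(0x8000, true, true, true, true) == true, everything else is false.
```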
##### LUT Based Ram
Using the input signals as an address, the LUT can be configured as a RAM. Normally LUT mode performs read operations; for writes, an address decoder generates clock signals to the latches.
![image-20250817185859510](./hpc-2025-non-stored-program-computing/image-20250817185859510.webp)
#### Routing Architecture
The logic blocks are connected to each other through a programmable routing network, which provides routing connections among logic blocks and I/O blocks to complete a user-designed circuit.
Horizontal and vertical wire segments are interconnected by programmable switches called programmable interconnect points (PIPs).
![image-20250817192006784](./hpc-2025-non-stored-program-computing/image-20250817192006784.webp)
These PIPs are implemented using a transmission gate controlled by a memory bit from the configuration memory.
Several types of PIPs are used in the FPGA:
- Cross-point: connects vertical or horizontal wire segments allowing turns.
- Breakpoint: connects or isolates 2 wire segments.
- Decoded MUX: groups of cross-points connected to a single output configured by n configuration bits.
- Non-decoded MUX: n wire segments each with a configuration bit.
- Compound cross-point: consists of 6 breakpoint PIPs and can isolate two separate signal nets.
![image-20250817194355228](./hpc-2025-non-stored-program-computing/image-20250817194355228.webp)
#### Input/Output Architecture
The I/O pad and the surrounding support logic and circuitry are referred to as an input/output cell.
The programmable input/output cells consist of three parts:
- Bi-directional buffers
- Routing resources.
- Programmable I/O voltage and current levels.
![image-20250817195139631](./hpc-2025-non-stored-program-computing/image-20250817195139631.webp)
#### Fine-grained and Coarse-grained Architecture
The fine-grained architecture:
- Each logic block can implement a very simple function.
- Very efficient in implementing systolic algorithms.
- Has a larger number of interconnects per logic block relative to the functionality offered.
The coarse-grained architecture:
- Each logic block is relatively packed with more logic.
- Has their logic blocks packed with more functionality.
- Has fewer interconnections, which reduces the propagation delays encountered.
#### Interconnect Devices
FPGAs are based on an array of logic modules and uncommitted wires to route signals.
Three types of interconnect devices have been commonly used to connect these wires:
- Static random access memory (SRAM) based
- Anti-fuse based
- EEPROM based
### FPGA Design Flow
![image-20250817195714935](./hpc-2025-non-stored-program-computing/image-20250817195714935.webp)
![image-20250817200350750](./hpc-2025-non-stored-program-computing/image-20250817200350750.webp)
The FPGA configuration techniques include:
- Full configuration and read back.
- Partial re-configuration and read back.
- Compressed configuration.
Building on partial reconfiguration, run-time reconfiguration was developed: the area to be reconfigured is changed at run time.
#### Hardware Description Languages(HDL)
There are three languages targeting FPGAs:
- VHDL: VHSIC Hardware Description Language.
- Verilog
- OpenCL
The first two languages are typical HDLs:
| Verilog | VHDL |
| -------------------------------------- | ------------------------------- |
| Has fixed data types. | Has abstract data types. |
| Relatively easy to learn. | Relatively difficult to learn. |
| Good gate level timing. | Poor gate level timing. |
| Interpreted constructs. | Compiled constructs. |
| Limited design reusability. | Good design reusability. |
| Doesn't support structure replication. | Supports structure replication. |
| Limited design management. | Good design management. |
OpenCL is not a traditional hardware description language. It needs to turn thread parallelism into hardware parallelism, called **pipeline parallelism**.
The following figure shows how the OpenCL-FPGA compiler turns a vector-add function into a circuit.
![image-20250829210329225](./hpc-2025-non-stored-program-computing/image-20250829210329225.webp)
The compiler generates three stages for this function:
1. In the first stage, two loading units are used.
2. In the second stage, one adding unit is used.
3. In the third stage, one storing unit is used.
In one cycle, thread `N` is clocked into the first stage, loading values from the arrays; meanwhile, thread `N - 1` is in the second stage, adding the values, and thread `N - 2` is in the third stage, storing the result into the target array.
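For reference, the vector-add kernel being compiled might look like the following (written here in CUDA syntax for consistency with the other notes; the OpenCL original would use `__kernel` and `get_global_id(0)`):

```cuda
// Each work-item performs two loads, one add and one store; on an FPGA the
// OpenCL compiler turns these steps into pipeline stages and clocks a new
// work-item into the pipeline every cycle.
__global__ void vector_add(const float *a, const float *b, float *c, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // get_global_id(0) in OpenCL
    if (i < n)
        c[i] = a[i] + b[i];                          // load, add, store
}
```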
So, different from the CPU and GPU, OpenCL on the FPGA has two levels of parallelism:
- Pipelining
- Replication of the kernels and having them run concurrently.

View File

@@ -1,100 +0,0 @@
---
title: High Performance Computing 25 SP OpenCL Programming
date: 2025-08-31T13:51:02.0181970+08:00
tags:
- 高性能计算
- 学习资料
---
Open Computing Language.
<!--more-->
OpenCL is Open Computing Language.
- Open, royalty-free standard C-language extension.
- For parallel programming of heterogeneous systems using GPUs, CPUs, CBE, DSPs and other processors, including embedded mobile devices.
- Managed by Khronos Group.
![image-20250529185915068](./hpc-2025-opencl/image-20250529185915068.webp)
### Anatomy of OpenCL
- Platform Layer API
- Runtime API
- Language Specification
### Compilation Model
OpenCL uses dynamic/runtime compilation model like OpenGL.
1. The code is compiled to an IR.
2. The IR is compiled to a machine code for execution.
In dynamic compilation, *step 1* is usually done once and the IR is stored. The app loads the IR and performs *step 2* at runtime.
### Execution Model
An OpenCL program is divided into:
- Kernel: basic unit of executable code.
- Host: collection of compute kernels and internal functions.
The host program invokes a kernel over an index space called an **NDRange**.
NDRange stands for *N-Dimensional Range* and can be a 1-, 2- or 3-dimensional space.
A single kernel instance at a point of this index space is called a **work item**. Work items are further grouped into **work groups**.
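Since these notes later compare OpenCL with CUDA, a hedged sketch of how the NDRange terms map onto a CUDA launch (names are illustrative):

```cuda
// OpenCL term           CUDA equivalent
//   NDRange          -> the whole grid
//   work group       -> thread block
//   work item        -> thread
//   get_global_id(0) -> blockIdx.x * blockDim.x + threadIdx.x
__global__ void kernel_over_ndrange(float *data, int n)
{
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    if (global_id < n)
        data[global_id] += 1.0f;
}

// Host side: a 1-dimensional index space of n work items in groups of 256.
// kernel_over_ndrange<<<(n + 255) / 256, 256>>>(data, n);
```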
### OpenCL Memory Model
![image-20250529191215424](./hpc-2025-opencl/image-20250529191215424.webp)
Multiple distinct address spaces: address spaces can be collapsed depending on the device's memory subsystem.
Address space:
- Private: private to a work item.
- Local: local to a work group.
- Global: accessible by all work items in all work groups.
- Constant: read only global memory.
> Comparison with CUDA:
>
> ![image-20250529191414250](./hpc-2025-opencl/image-20250529191414250.webp)
Memory region for host and kernel:
![image-20250529191512490](./hpc-2025-opencl/image-20250529191512490.webp)
### Programming Model
#### Data Parallel Programming Model
1. Define N-Dimensional computation domain
2. Work-items can be grouped together as *work group*.
3. Execute multiple work-groups in parallel.
#### Task Parallel Programming Model
> The data-parallel execution model must be implemented by all OpenCL compute devices, but the task-parallel programming model is optional for vendors.
Some computing devices such as CPUs can also execute task-parallel computing kernels.
- Executes as a single work item.
- A computing kernel written in OpenCL.
- A native function.
### OpenCL Framework
![image-20250529192022613](./hpc-2025-opencl/image-20250529192022613.webp)
The basic OpenCL program structure:
![image-20250529192056388](./hpc-2025-opencl/image-20250529192056388.webp)
**Contexts** are used to contain and manage the state of the *world*.
A **command queue** coordinates execution of the kernels.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,366 +0,0 @@
---
title: High Performance Computing 25 SP Dichotomy of Parallel Computing Platforms
date: 2025-03-28T01:03:32.2187720+08:00
tags:
- 高性能计算
- 学习资料
---
Designing algorithms is always the hardest.
<!--more-->
Flynn's classical taxonomy:
- SISD
- SIMD
- MISD
- MIMD
Multiple instruction, multiple data (MIMD) is currently the most common type of parallel computer.
> A variant: single program multiple data(SPMD).
## Dichotomy of Parallel Computing Platforms
Based on the logical and physical organization of parallel platforms.
Logical organization (from a programmer's perspective):
- Control structure: ways of expressing parallel tasks.
- Communication model: interactions between tasks.
Hardware organization:
- Architecture
- Interconnection networks.
Control Structure of Parallel Platform: parallel tasks can be specified at various levels of granularity.
Communication Model: **Shared address space platforms**. Support a common data space that is accessible to all processors. Two types of architectures:
- Uniform memory access (UMA)
- Non-uniform memory access(NUMA)
> NUMA and UMA are defined in terms of memory access times, not cache access times.
![image-20250313193604905](./hpc-2025-parallel-computing/image-20250313193604905.webp)
NUMA and UMA:
- The distinction between NUMA and UMA platforms is important from the point of view of algorithm design.
- Programming these platforms is easier since reading and writing are implicitly visible to other processors.
- Caches in such machines require coordinated access to multiple copies.
> Leads to cache coherence problem.
- A weaker model of these machines provides an address map but not coordinated access.
**Global Memory Space**:
- Easy to program.
- Read-only interactions:
Invisible to programmers.
Same as in serial programs.
- Read/write interactions:
Mutual exclusion for concurrent access such as lock and related mechanisms.
- Programming paradigms: Threads/Directives.
Caches in shared-address-space:
- Address translation mechanism to locate a memory word in the system.
- Well-defined semantics over multiple copies(**cache coherence**).
> Shared-address-space vs shared memory machine:
>
> Shared address space is a programming abstraction.
>
> Shared memory machine is a physical machine attribute.
Distributed Shared Memory(DSM) or Shared Virtual Memory(SVM):
- Page-based access control: leverage the virtual memory support and manage main memory as a fully associative cache on the virtual address space by embedding a coherence protocol in the page fault handler.
- Object based access control: flexible but no false sharing.
## Parallel Algorithm Design
Steps in parallel algorithm design:
- Identifying portions of the work that can be performed concurrently.
- Mapping the concurrent pieces of work onto multiple processors running in parallel.
- Distributing the input, output and intermediate data associated with the program.
- Managing accesses to data shared by multiple processors.
- Synchronizing the processors at various stages of the parallel program execution.
### Decomposition
Dividing a computation into smaller parts some or all of which may be executed in parallel.
Tasks: programmer-defined units of computation, of arbitrary size but indivisible.
Aim: **reducing execution time**
Ideal decomposition:
- All tasks have similar size.
- Tasks are **not** waiting for each other and **not** sharing resources.
Dependency graphs:
Task dependency graph: an abstraction to express dependencies among tasks and their relative order of execution.
- Directed acyclic graphs.
- Nodes are tasks.
- Directed edges: dependencies amongst tasks.
> The fewer directed edges, the better for parallelism.
Granularity:
The granularity of the decomposition: the number and size of tasks into which a problem is decomposed.
- Fine-grained: a large number of small tasks.
- Coarse-grained: a small number of large tasks.
Concurrency:
**maximum degree of concurrency**
**Average degree of concurrency**
The critical path determines the average degree of concurrency.
Critical path is the longest directed path between any pair of start and finish nodes. So a shorter critical path favors a higher degree of concurrency.
**Limited Granularity**:
It may appear that decomposing into ever more, ever smaller tasks will keep increasing the concurrency that can be exploited.
But there is an inherent bound on how fine-grained a decomposition a problem permits.
Speedup:
The ratio of serial to parallel execution time. Restrictions on obtaining unbounded speedup from:
- Limited granularity.
- Degree of concurrency.
- Interaction among tasks running on different physical processors.
Processor:
A computing agent that performs tasks: an abstract entity that uses the code and data of a task to produce the output of the task within a finite amount of time.
Mapping: the mechanism by which tasks are assigned to processor for execution. The task dependency and task interaction graphs play an important role.
Decomposition techniques:
Fundamental steps: split the computations to be performed into a set of tasks for concurrent execution.
1. Recursive decomposition.
A method for inducing concurrency in problems that can be solved using the **divide-and-conquer** strategy.
2. Data decomposition.
A method for deriving concurrency in algorithms that operate on large data structures.
The operations performed by these tasks on different data partitions.
Can be partitioning output data and partitioning input data or even partitioning intermediate data.
3. Exploratory decomposition.
Decompose problems whose underlying computations correspond to a search of a space for solutions.
Exploratory decomposition appears similar to data decomposition.
4. Speculative decomposition.
Used when a program may take one of many possible computationally significant branches depending on the output of preceding computation.
Similar to evaluating the branches of a *switch* statement in `C` in parallel: the correct branch is kept and the other branches are discarded.
The parallel run time is smaller than the serial run time by the amount of time to evaluate the condition.
### Characteristics of Tasks
**Task generation**:
- Static: all the tasks are known before the algorithm starts executing.
- Dynamic: the actual tasks and the task dependency graph are not explicitly available a priori.
- Either static or dynamic.
**Task Sizes**:
The relative amount of time required to complete the task.
- Uniform
- Non-uniform
The knowledge of task sizes will influence the choice of mapping scheme.
**Inter-Task Interactions**:
- Static versus dynamic.
- Regular versus irregular.
- Read-only versus read-write
- One-way versus two-way.
### Mapping Techniques
Mapping techniques are used for load balancing.
Good mappings:
- Reduce the interaction time.
- Reduce the idle time.
![image-20250320200524155](./hpc-2025-parallel-computing/image-20250320200524155.webp)
There are two mapping methods:
- **Static Mapping**: determined by programming paradigm and the characteristics of tasks and interactions.
Static mapping is often used in conjunction with *data partitioning* and *task partitioning*.
- **Dynamic Mapping**: distribute the work among processors during the execution. Also referred as dynamic load-balancing.
In the **centralized scheme**, all executable tasks are maintained in a common central data structure and distributed by a special process (or a subset of processes) acting as the **master** process.
A centralized scheme is easy to implement but has limited scalability.
In the **distributed scheme**, the set of executable tasks is distributed among processes, which exchange tasks at run time to balance the work.
**Minimize frequency of interactions**:
There is a relatively high startup cost associated with each interaction on many architectures.
So restructure the algorithm such that shared data are accessed and used in large pieces.
**Minimize contention and hot spots**:
Contention occurs when multiple tasks try to access the same resources concurrently.
Centralized schemes for dynamic mapping are a frequent source of contention, so use distributed mapping schemes instead.
**Overlapping computations with interactions**:
When waiting for shared data, do some useful computations.
- Initiate an interaction early enough that it completes before it is needed.
- In dynamic mapping schemes, a process can anticipate that it is going to run out of work and initiate a work-transfer interaction in advance.
Overlapping computations with interaction requires support from the programming paradigm, the operating system and the hardware.
- Disjoint address-space paradigm: non-blocking message passing primitives.
- Shared address-space paradigm: prefetching hardware which can anticipate the memory addresses and initiate accesses in advance of when they are needed.
**Replicating data or computations**:
Multiple processors may require frequent read-only access to shared data structure such as a hash-table.
For different paradigm:
- Shared address space: use caches.
- Message passing: remote data accesses are more expensive and harder than local accesses.
Data replication increases the memory requirements. In some situations, it may be more cost-effective to recompute intermediate results than to fetch them from another place.
**Using optimized collective interaction operations**:
Collective operations are like:
- Broadcasting some data to all processes.
- Adding up numbers each belonging to a different process.
### Parallel Algorithm Model
The way of structuring parallel algorithm by
- Selecting a decomposition
- Selecting a mapping technique.
- Applying the appropriate strategy to minimize interactions.
**Data parallel model**:
The tasks are statically or semi-statically mapped onto processes and each task performs similar operations on different data.
Example: matrix multiplication.
**Task graph model**:
The interrelations among the tasks are utilized to promote locality or to reduce interaction costs.
Example: quick sort, sparse matrix factorization and many other algorithms using divide-and-conquer decomposition.
**Work pool model**:
Characterized by a dynamic mapping of task onto processes for load balancing.
Example: parallelization of loops by chunk scheduling.
**Master-slave model** :
One or more master processes generate work and allocate it to worker processes.
**Pipeline or producer-consumer model**:
A stream of data is passed on through a succession of processes, each of which performs some tasks.
### Analytical Modeling of Parallel Programs
**Performance evaluation**:
Evaluation in terms of execution time.
A parallel system is the combination of an algorithm and the parallel architecture on which it is implemented.
**Sources of overhead in parallel program**:
A typical execution includes:
- Essential computation
Computation that would be performed by the serial program for solving the same problem instance.
- Interprocess communication
- Idling
- Excess computation
Computation that is not performed by the serial program.
**Performance metrics for parallel system**:
- Execution time
- Overhead function
- Total overhead
- Speedup
> For a given problem, more than one sequential algorithm may be available.
Theoretically speaking, speedup can never exceed the number of PEs.
If superlinear speedup occurs, the work performed by the serial program is greater than that of its parallel formulation, or hardware features put the serial implementation at a disadvantage.
**Amdahl's Law**:
![image-20250327194045418](./hpc-2025-parallel-computing/image-20250327194045418.webp)
The overall performance improvement gained by optimizing a single part of a system is limited by the fraction of time that the improved part is actually used.
Efficiency: a measure of the fraction of time for which a PE is usefully employed.
Cost: the product of parallel run time and the number of processing elements used.
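Written out (a standard formulation, not copied from the slides), with $T_s$ the serial time, $T_p$ the parallel time on $p$ processing elements and $f$ the parallelizable fraction:

```latex
S = \frac{T_s}{T_p}, \qquad
S_{\text{Amdahl}} \le \frac{1}{(1 - f) + f/p}, \qquad
E = \frac{S}{p}, \qquad
\text{Cost} = p \cdot T_p
```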
![image-20250327194312962](./hpc-2025-parallel-computing/image-20250327194312962.webp)

View File

@@ -1,79 +0,0 @@
---
title: High Performance Computing 25 SP Potpourri
date: 2025-08-31T13:51:29.8809980+08:00
tags:
- 高性能计算
- 学习资料
---
Potpourri has a good taste.
<!--more-->
## Heterogeneous System Architecture
![image-20250612185019968](./hpc-2025-potpourri/image-20250612185019968.webp)
The goals of the HSA:
- Enable power efficient performance.
- Improve programmability of heterogeneous processors.
- Increase the portability of code across processors and platforms.
- Increase the pervasiveness of heterogeneous solutions.
### The Runtime Stack
![image-20250612185221643](./hpc-2025-potpourri/image-20250612185221643.webp)
## Accelerated Processing Unit
A processor that combines the CPU and the GPU elements into a single architecture.
![image-20250612185743675](./hpc-2025-potpourri/image-20250612185743675.webp)
## Intel Xeon Phi
The goal:
- Leverage X86 architecture and existing X86 programming models.
- Dedicate much of the silicon to floating point ops.
- Cache coherent.
- Increase floating-point throughput.
- Strip expensive features.
The reality:
- 10s of x86-based cores.
- Very high-bandwidth local GDDR5 memory.
- The card runs a modified embedded Linux.
## Deep Learning: Deep Neural Networks
The network itself can be used as a computer.
## Tensor Processing Unit
A custom ASIC for the inference phase of neural networks (an AI accelerator).
### TPUv1 Architecture
![image-20250612191035632](./hpc-2025-potpourri/image-20250612191035632.webp)
### TPUv2 Architecture
![image-20250612191118473](./hpc-2025-potpourri/image-20250612191118473.webp)
Advantages of TPU:
- Allows predictions to be made very quickly, responding within a fraction of a second.
- Accelerates linear computation, the key to machine learning applications.
- Minimizes the time to accuracy when training large and complex network models.
Disadvantages of TPU:
- Linear algebra that requires heavy branching or is not computed as element-wise algebra.
- Workloads that are not dominated by matrix multiplication are not likely to perform well on TPUs.
- Workloads that access memory in a sparse manner.
- Workloads that require high-precision arithmetic operations.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff