Annotation of sys/arch/m68k/fpsp/stan.sa, Revision 1.1.1.1
1.1 nbrk 1: * $OpenBSD: stan.sa,v 1.3 2003/11/07 10:36:10 miod Exp $
2: * $NetBSD: stan.sa,v 1.3 1994/10/26 07:50:10 cgd Exp $
3:
4: * MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP
5: * M68000 Hi-Performance Microprocessor Division
6: * M68040 Software Package
7: *
8: * M68040 Software Package Copyright (c) 1993, 1994 Motorola Inc.
9: * All rights reserved.
10: *
11: * THE SOFTWARE is provided on an "AS IS" basis and without warranty.
12: * To the maximum extent permitted by applicable law,
13: * MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED,
14: * INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
15: * PARTICULAR PURPOSE and any warranty against infringement with
16: * regard to the SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF)
17: * and any accompanying written materials.
18: *
19: * To the maximum extent permitted by applicable law,
20: * IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
21: * (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS
22: * PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR
23: * OTHER PECUNIARY LOSS) ARISING OF THE USE OR INABILITY TO USE THE
24: * SOFTWARE. Motorola assumes no responsibility for the maintenance
25: * and support of the SOFTWARE.
26: *
27: * You are hereby granted a copyright license to use, modify, and
28: * distribute the SOFTWARE so long as this entire notice is retained
29: * without alteration in any modified and/or redistributed versions,
30: * and that such modified versions are clearly identified as such.
31: * No licenses are granted by implication, estoppel or otherwise
32: * under any patents or trademarks of Motorola, Inc.
33:
34: *
35: * stan.sa 3.3 7/29/91
36: *
37: * The entry point stan computes the tangent of
38: * an input argument;
39: * stand does the same except for denormalized input.
40: *
41: * Input: Double-extended number X in location pointed to
42: * by address register a0.
43: *
44: * Output: The value tan(X) returned in floating-point register Fp0.
45: *
46: * Accuracy and Monotonicity: The returned result is within 3 ulp in
47: * 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
48: * result is subsequently rounded to double precision. The
49: * result is provably monotonic in double precision.
50: *
51: * Speed: The program sTAN takes approximately 170 cycles for
52: * input argument X such that |X| < 15Pi, which is the usual
53: * situation.
54: *
55: * Algorithm:
56: *
57: * 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
58: *
59: * 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
60: * k = N mod 2, so in particular, k = 0 or 1.
61: *
62: * 3. If k is odd, go to 5.
63: *
64: * 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
65: * rational function U/V where
66: * U = r + r*s*(P1 + s*(P2 + s*P3)), and
67: * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r.
68: * Exit.
69: *
70: * 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
71: * rational function U/V where
72: * U = r + r*s*(P1 + s*(P2 + s*P3)), and
73: * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
74: * -Cot(r) = -V/U. Exit.
75: *
76: * 6. If |X| > 1, go to 8.
77: *
78: * 7. (|X|<2**(-40)) Tan(X) = X. Exit.
79: *
80: * 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
81: *
82:
83: STAN IDNT 2,1 Motorola 040 Floating Point Software Package
84:
85: section 8
86:
87: include fpsp.h
88:
89: BOUNDS1 DC.L $3FD78000,$4004BC7E
90: TWOBYPI DC.L $3FE45F30,$6DC9C883
91:
92: TANQ4 DC.L $3EA0B759,$F50F8688
93: TANP3 DC.L $BEF2BAA5,$A8924F04
94:
95: TANQ3 DC.L $BF346F59,$B39BA65F,$00000000,$00000000
96:
97: TANP2 DC.L $3FF60000,$E073D3FC,$199C4A00,$00000000
98:
99: TANQ2 DC.L $3FF90000,$D23CD684,$15D95FA1,$00000000
100:
101: TANP1 DC.L $BFFC0000,$8895A6C5,$FB423BCA,$00000000
102:
103: TANQ1 DC.L $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000
104:
105: INVTWOPI DC.L $3FFC0000,$A2F9836E,$4E44152A,$00000000
106:
107: TWOPI1 DC.L $40010000,$C90FDAA2,$00000000,$00000000
108: TWOPI2 DC.L $3FDF0000,$85A308D4,$00000000,$00000000
109:
110: *--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
111: *--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
112: *--MOST 69 BITS LONG.
113: xdef PITBL
114: PITBL:
115: DC.L $C0040000,$C90FDAA2,$2168C235,$21800000
116: DC.L $C0040000,$C2C75BCD,$105D7C23,$A0D00000
117: DC.L $C0040000,$BC7EDCF7,$FF523611,$A1E80000
118: DC.L $C0040000,$B6365E22,$EE46F000,$21480000
119: DC.L $C0040000,$AFEDDF4D,$DD3BA9EE,$A1200000
120: DC.L $C0040000,$A9A56078,$CC3063DD,$21FC0000
121: DC.L $C0040000,$A35CE1A3,$BB251DCB,$21100000
122: DC.L $C0040000,$9D1462CE,$AA19D7B9,$A1580000
123: DC.L $C0040000,$96CBE3F9,$990E91A8,$21E00000
124: DC.L $C0040000,$90836524,$88034B96,$20B00000
125: DC.L $C0040000,$8A3AE64F,$76F80584,$A1880000
126: DC.L $C0040000,$83F2677A,$65ECBF73,$21C40000
127: DC.L $C0030000,$FB53D14A,$A9C2F2C2,$20000000
128: DC.L $C0030000,$EEC2D3A0,$87AC669F,$21380000
129: DC.L $C0030000,$E231D5F6,$6595DA7B,$A1300000
130: DC.L $C0030000,$D5A0D84C,$437F4E58,$9FC00000
131: DC.L $C0030000,$C90FDAA2,$2168C235,$21000000
132: DC.L $C0030000,$BC7EDCF7,$FF523611,$A1680000
133: DC.L $C0030000,$AFEDDF4D,$DD3BA9EE,$A0A00000
134: DC.L $C0030000,$A35CE1A3,$BB251DCB,$20900000
135: DC.L $C0030000,$96CBE3F9,$990E91A8,$21600000
136: DC.L $C0030000,$8A3AE64F,$76F80584,$A1080000
137: DC.L $C0020000,$FB53D14A,$A9C2F2C2,$1F800000
138: DC.L $C0020000,$E231D5F6,$6595DA7B,$A0B00000
139: DC.L $C0020000,$C90FDAA2,$2168C235,$20800000
140: DC.L $C0020000,$AFEDDF4D,$DD3BA9EE,$A0200000
141: DC.L $C0020000,$96CBE3F9,$990E91A8,$20E00000
142: DC.L $C0010000,$FB53D14A,$A9C2F2C2,$1F000000
143: DC.L $C0010000,$C90FDAA2,$2168C235,$20000000
144: DC.L $C0010000,$96CBE3F9,$990E91A8,$20600000
145: DC.L $C0000000,$C90FDAA2,$2168C235,$1F800000
146: DC.L $BFFF0000,$C90FDAA2,$2168C235,$1F000000
147: DC.L $00000000,$00000000,$00000000,$00000000
148: DC.L $3FFF0000,$C90FDAA2,$2168C235,$9F000000
149: DC.L $40000000,$C90FDAA2,$2168C235,$9F800000
150: DC.L $40010000,$96CBE3F9,$990E91A8,$A0600000
151: DC.L $40010000,$C90FDAA2,$2168C235,$A0000000
152: DC.L $40010000,$FB53D14A,$A9C2F2C2,$9F000000
153: DC.L $40020000,$96CBE3F9,$990E91A8,$A0E00000
154: DC.L $40020000,$AFEDDF4D,$DD3BA9EE,$20200000
155: DC.L $40020000,$C90FDAA2,$2168C235,$A0800000
156: DC.L $40020000,$E231D5F6,$6595DA7B,$20B00000
157: DC.L $40020000,$FB53D14A,$A9C2F2C2,$9F800000
158: DC.L $40030000,$8A3AE64F,$76F80584,$21080000
159: DC.L $40030000,$96CBE3F9,$990E91A8,$A1600000
160: DC.L $40030000,$A35CE1A3,$BB251DCB,$A0900000
161: DC.L $40030000,$AFEDDF4D,$DD3BA9EE,$20A00000
162: DC.L $40030000,$BC7EDCF7,$FF523611,$21680000
163: DC.L $40030000,$C90FDAA2,$2168C235,$A1000000
164: DC.L $40030000,$D5A0D84C,$437F4E58,$1FC00000
165: DC.L $40030000,$E231D5F6,$6595DA7B,$21300000
166: DC.L $40030000,$EEC2D3A0,$87AC669F,$A1380000
167: DC.L $40030000,$FB53D14A,$A9C2F2C2,$A0000000
168: DC.L $40040000,$83F2677A,$65ECBF73,$A1C40000
169: DC.L $40040000,$8A3AE64F,$76F80584,$21880000
170: DC.L $40040000,$90836524,$88034B96,$A0B00000
171: DC.L $40040000,$96CBE3F9,$990E91A8,$A1E00000
172: DC.L $40040000,$9D1462CE,$AA19D7B9,$21580000
173: DC.L $40040000,$A35CE1A3,$BB251DCB,$A1100000
174: DC.L $40040000,$A9A56078,$CC3063DD,$A1FC0000
175: DC.L $40040000,$AFEDDF4D,$DD3BA9EE,$21200000
176: DC.L $40040000,$B6365E22,$EE46F000,$A1480000
177: DC.L $40040000,$BC7EDCF7,$FF523611,$21E80000
178: DC.L $40040000,$C2C75BCD,$105D7C23,$20D00000
179: DC.L $40040000,$C90FDAA2,$2168C235,$A1800000
180:
181: INARG equ FP_SCR4
182:
183: TWOTO63 equ L_SCR1
184: ENDFLAG equ L_SCR2
185: N equ L_SCR3
186:
187: xref t_frcinx
188: xref t_extdnrm
189:
190: xdef stand
191: stand:
192: *--TAN(X) = X FOR DENORMALIZED X
193:
194: bra t_extdnrm
195:
196: xdef stan
197: stan:
198: FMOVE.X (a0),FP0 ...LOAD INPUT
199:
200: MOVE.L (A0),D0
201: MOVE.W 4(A0),D0
202: ANDI.L #$7FFFFFFF,D0
203:
204: CMPI.L #$3FD78000,D0 ...|X| >= 2**(-40)?
205: BGE.B TANOK1
206: BRA.W TANSM
207: TANOK1:
208: CMPI.L #$4004BC7E,D0 ...|X| < 15 PI?
209: BLT.B TANMAIN
210: BRA.W REDUCEX
211:
212:
213: TANMAIN:
214: *--THIS IS THE USUAL CASE, |X| <= 15 PI.
215: *--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
216: FMOVE.X FP0,FP1
217: FMUL.D TWOBYPI,FP1 ...X*2/PI
218:
219: *--HIDE THE NEXT TWO INSTRUCTIONS
220: lea.l PITBL+$200,a1 ...TABLE OF N*PI/2, N = -32,...,32
221:
222: *--FP1 IS NOW READY
223: FMOVE.L FP1,D0 ...CONVERT TO INTEGER
224:
225: ASL.L #4,D0
226: ADDA.L D0,a1 ...ADDRESS N*PIBY2 IN Y1, Y2
227:
228: FSUB.X (a1)+,FP0 ...X-Y1
229: *--HIDE THE NEXT ONE
230:
231: FSUB.S (a1),FP0 ...FP0 IS R = (X-Y1)-Y2
232:
233: ROR.L #5,D0
234: ANDI.L #$80000000,D0 ...D0 WAS ODD IFF D0 < 0
235:
236: TANCONT:
237:
238: TST.L D0
239: BLT.W NODD
240:
241: FMOVE.X FP0,FP1
242: FMUL.X FP1,FP1 ...S = R*R
243:
244: FMOVE.D TANQ4,FP3
245: FMOVE.D TANP3,FP2
246:
247: FMUL.X FP1,FP3 ...SQ4
248: FMUL.X FP1,FP2 ...SP3
249:
250: FADD.D TANQ3,FP3 ...Q3+SQ4
251: FADD.X TANP2,FP2 ...P2+SP3
252:
253: FMUL.X FP1,FP3 ...S(Q3+SQ4)
254: FMUL.X FP1,FP2 ...S(P2+SP3)
255:
256: FADD.X TANQ2,FP3 ...Q2+S(Q3+SQ4)
257: FADD.X TANP1,FP2 ...P1+S(P2+SP3)
258:
259: FMUL.X FP1,FP3 ...S(Q2+S(Q3+SQ4))
260: FMUL.X FP1,FP2 ...S(P1+S(P2+SP3))
261:
262: FADD.X TANQ1,FP3 ...Q1+S(Q2+S(Q3+SQ4))
263: FMUL.X FP0,FP2 ...RS(P1+S(P2+SP3))
264:
265: FMUL.X FP3,FP1 ...S(Q1+S(Q2+S(Q3+SQ4)))
266:
267:
268: FADD.X FP2,FP0 ...R+RS(P1+S(P2+SP3))
269:
270:
271: FADD.S #:3F800000,FP1 ...1+S(Q1+...)
272:
273: FMOVE.L d1,fpcr ;restore users exceptions
274: FDIV.X FP1,FP0 ;last inst - possible exception set
275:
276: bra t_frcinx
277:
278: NODD:
279: FMOVE.X FP0,FP1
280: FMUL.X FP0,FP0 ...S = R*R
281:
282: FMOVE.D TANQ4,FP3
283: FMOVE.D TANP3,FP2
284:
285: FMUL.X FP0,FP3 ...SQ4
286: FMUL.X FP0,FP2 ...SP3
287:
288: FADD.D TANQ3,FP3 ...Q3+SQ4
289: FADD.X TANP2,FP2 ...P2+SP3
290:
291: FMUL.X FP0,FP3 ...S(Q3+SQ4)
292: FMUL.X FP0,FP2 ...S(P2+SP3)
293:
294: FADD.X TANQ2,FP3 ...Q2+S(Q3+SQ4)
295: FADD.X TANP1,FP2 ...P1+S(P2+SP3)
296:
297: FMUL.X FP0,FP3 ...S(Q2+S(Q3+SQ4))
298: FMUL.X FP0,FP2 ...S(P1+S(P2+SP3))
299:
300: FADD.X TANQ1,FP3 ...Q1+S(Q2+S(Q3+SQ4))
301: FMUL.X FP1,FP2 ...RS(P1+S(P2+SP3))
302:
303: FMUL.X FP3,FP0 ...S(Q1+S(Q2+S(Q3+SQ4)))
304:
305:
306: FADD.X FP2,FP1 ...R+RS(P1+S(P2+SP3))
307: FADD.S #:3F800000,FP0 ...1+S(Q1+...)
308:
309:
310: FMOVE.X FP1,-(sp)
311: EORI.L #$80000000,(sp)
312:
313: FMOVE.L d1,fpcr ;restore users exceptions
314: FDIV.X (sp)+,FP0 ;last inst - possible exception set
315:
316: bra t_frcinx
317:
318: TANBORS:
319: *--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
320: *--IF |X| < 2**(-40), RETURN X OR 1.
321: CMPI.L #$3FFF8000,D0
322: BGT.B REDUCEX
323:
324: TANSM:
325:
326: FMOVE.X FP0,-(sp)
327: FMOVE.L d1,fpcr ;restore users exceptions
328: FMOVE.X (sp)+,FP0 ;last inst - posibble exception set
329:
330: bra t_frcinx
331:
332:
333: REDUCEX:
334: *--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
335: *--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
336: *--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
337:
338: FMOVEM.X FP2-FP5,-(A7) ...save FP2 through FP5
339: MOVE.L D2,-(A7)
340: FMOVE.S #:00000000,FP1
341:
342: *--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
343: *--there is a danger of unwanted overflow in first LOOP iteration. In this
344: *--case, reduce argument by one remainder step to make subsequent reduction
345: *--safe.
346: cmpi.l #$7ffeffff,d0 ;is argument dangerously large?
347: bne.b LOOP
348: move.l #$7ffe0000,FP_SCR2(a6) ;yes
349: * ;create 2**16383*PI/2
350: move.l #$c90fdaa2,FP_SCR2+4(a6)
351: clr.l FP_SCR2+8(a6)
352: ftst.x fp0 ;test sign of argument
353: move.l #$7fdc0000,FP_SCR3(a6) ;create low half of 2**16383*
354: * ;PI/2 at FP_SCR3
355: move.l #$85a308d3,FP_SCR3+4(a6)
356: clr.l FP_SCR3+8(a6)
357: fblt.w red_neg
358: or.w #$8000,FP_SCR2(a6) ;positive arg
359: or.w #$8000,FP_SCR3(a6)
360: red_neg:
361: fadd.x FP_SCR2(a6),fp0 ;high part of reduction is exact
362: fmove.x fp0,fp1 ;save high result in fp1
363: fadd.x FP_SCR3(a6),fp0 ;low part of reduction
364: fsub.x fp0,fp1 ;determine low component of result
365: fadd.x FP_SCR3(a6),fp1 ;fp0/fp1 are reduced argument.
366:
367: *--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
368: *--integer quotient will be stored in N
369: *--Intermeditate remainder is 66-bit long; (R,r) in (FP0,FP1)
370:
371: LOOP:
372: FMOVE.X FP0,INARG(a6) ...+-2**K * F, 1 <= F < 2
373: MOVE.W INARG(a6),D0
374: MOVE.L D0,A1 ...save a copy of D0
375: ANDI.L #$00007FFF,D0
376: SUBI.L #$00003FFF,D0 ...D0 IS K
377: CMPI.L #28,D0
378: BLE.B LASTLOOP
379: CONTLOOP:
380: SUBI.L #27,D0 ...D0 IS L := K-27
381: CLR.L ENDFLAG(a6)
382: BRA.B WORK
383: LASTLOOP:
384: CLR.L D0 ...D0 IS L := 0
385: MOVE.L #1,ENDFLAG(a6)
386:
387: WORK:
388: *--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
389: *--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
390:
391: *--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
392: *--2**L * (PIby2_1), 2**L * (PIby2_2)
393:
394: MOVE.L #$00003FFE,D2 ...BIASED EXPO OF 2/PI
395: SUB.L D0,D2 ...BIASED EXPO OF 2**(-L)*(2/PI)
396:
397: MOVE.L #$A2F9836E,FP_SCR1+4(a6)
398: MOVE.L #$4E44152A,FP_SCR1+8(a6)
399: MOVE.W D2,FP_SCR1(a6) ...FP_SCR1 is 2**(-L)*(2/PI)
400:
401: FMOVE.X FP0,FP2
402: FMUL.X FP_SCR1(a6),FP2
403: *--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
404: *--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
405: *--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
406: *--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
407: *--US THE DESIRED VALUE IN FLOATING POINT.
408:
409: *--HIDE SIX CYCLES OF INSTRUCTION
410: MOVE.L A1,D2
411: SWAP D2
412: ANDI.L #$80000000,D2
413: ORI.L #$5F000000,D2 ...D2 IS SIGN(INARG)*2**63 IN SGL
414: MOVE.L D2,TWOTO63(a6)
415:
416: MOVE.L D0,D2
417: ADDI.L #$00003FFF,D2 ...BIASED EXPO OF 2**L * (PI/2)
418:
419: *--FP2 IS READY
420: FADD.S TWOTO63(a6),FP2 ...THE FRACTIONAL PART OF FP1 IS ROUNDED
421:
422: *--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2
423: MOVE.W D2,FP_SCR2(a6)
424: CLR.W FP_SCR2+2(a6)
425: MOVE.L #$C90FDAA2,FP_SCR2+4(a6)
426: CLR.L FP_SCR2+8(a6) ...FP_SCR2 is 2**(L) * Piby2_1
427:
428: *--FP2 IS READY
429: FSUB.S TWOTO63(a6),FP2 ...FP2 is N
430:
431: ADDI.L #$00003FDD,D0
432: MOVE.W D0,FP_SCR3(a6)
433: CLR.W FP_SCR3+2(a6)
434: MOVE.L #$85A308D3,FP_SCR3+4(a6)
435: CLR.L FP_SCR3+8(a6) ...FP_SCR3 is 2**(L) * Piby2_2
436:
437: MOVE.L ENDFLAG(a6),D0
438:
439: *--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
440: *--P2 = 2**(L) * Piby2_2
441: FMOVE.X FP2,FP4
442: FMul.X FP_SCR2(a6),FP4 ...W = N*P1
443: FMove.X FP2,FP5
444: FMul.X FP_SCR3(a6),FP5 ...w = N*P2
445: FMove.X FP4,FP3
446: *--we want P+p = W+w but |p| <= half ulp of P
447: *--Then, we need to compute A := R-P and a := r-p
448: FAdd.X FP5,FP3 ...FP3 is P
449: FSub.X FP3,FP4 ...W-P
450:
451: FSub.X FP3,FP0 ...FP0 is A := R - P
452: FAdd.X FP5,FP4 ...FP4 is p = (W-P)+w
453:
454: FMove.X FP0,FP3 ...FP3 A
455: FSub.X FP4,FP1 ...FP1 is a := r - p
456:
457: *--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
458: *--|r| <= half ulp of R.
459: FAdd.X FP1,FP0 ...FP0 is R := A+a
460: *--No need to calculate r if this is the last loop
461: TST.L D0
462: BGT.W RESTORE
463:
464: *--Need to calculate r
465: FSub.X FP0,FP3 ...A-R
466: FAdd.X FP3,FP1 ...FP1 is r := (A-R)+a
467: BRA.W LOOP
468:
469: RESTORE:
470: FMOVE.L FP2,N(a6)
471: MOVE.L (A7)+,D2
472: FMOVEM.X (A7)+,FP2-FP5
473:
474:
475: MOVE.L N(a6),D0
476: ROR.L #1,D0
477:
478:
479: BRA.W TANCONT
480:
481: end
CVSweb